// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// 121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from libvpx (http://www.webmproject.org/code/) 131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "./dsp.h" 151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON) 171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 18af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include <assert.h> 19af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 20af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#include "./neon.h" 211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "../enc/vp8enci.h" 221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Transforms (Paragraph 14.4) 251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Inverse transform. 27af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// This code is pretty much the same as TransformOne in the dec_neon.c, except 281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// for subtraction to *ref. See the comments there for algorithmic explanations. 29af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 30af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic const int16_t kC1 = 20091; 31af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 32af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 33af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// This code works but is *slower* than the inlined-asm version below 34af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// (with gcc-4.6). So we disable it for now. 
Later, it'll be conditional to 35af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// USE_INTRINSICS define. 36af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// With gcc-4.8, it's a little faster speed than inlined-assembly. 37af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS) 38af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 39af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. 40af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { 41af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); 42af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 43af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 44af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 45af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// to the corresponding rows of 'dst'. 46af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 47af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t dst01, 48af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t dst23) { 49af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Unsigned saturate to 8b. 50af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 51af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 52af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 53af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Store the results. 
54af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); 55af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); 56af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); 57af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); 58af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 59af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 60af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, 61af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* const ref, uint8_t* const dst) { 62af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x2_t dst01 = vdup_n_u32(0); 63af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x2_t dst23 = vdup_n_u32(0); 64af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 65af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Load the source pixels. 66af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0); 67af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0); 68af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1); 69af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1); 70af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 71af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 72af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Convert to 16b. 
73af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); 74af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); 75af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 76af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Descale with rounding. 77af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 78af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 79af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Add the inverse transform. 80af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora SaturateAndStore4x4(dst, out01, out23); 81af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 82af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 83af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 84af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 85af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x8x2_t* const out) { 86af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 87af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 88af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 89af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // b0 d0 b1 d1 b2 d2 ... 
90af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 91af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 92af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 93af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 94af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // {rows} = in0 | in4 95af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // in8 | in12 96af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // B1 = in4 | in12 97af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t B1 = 98af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 99af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // C0 = kC1 * in4 | kC1 * in12 100af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // C1 = kC2 * in4 | kC2 * in12 101af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 102af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 103af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 104af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_low_s16(rows->val[1])); // in0 + in8 105af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 106af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_low_s16(rows->val[1])); // in0 - in8 107af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // c = kC2 * in4 - kC1 * in12 108af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // d = kC1 * in4 + kC2 * in12 109af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 110af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 
111af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 112af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 113af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 114af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 115af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 116af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora Transpose8x2(E0, E1, rows); 117af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 118af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 119af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void ITransformOne(const uint8_t* ref, 120af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16_t* in, uint8_t* dst) { 121af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x8x2_t rows; 122af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 123af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora TransformPass(&rows); 124af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora TransformPass(&rows); 125af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora Add4x4(rows.val[0], rows.val[1], ref, dst); 126af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 127af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 128af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else 129af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 1301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransformOne(const uint8_t* ref, 1311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* in, uint8_t* dst) { 1321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 133af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; 
1341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 1361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {q1, q2}, [%[in]] \n" 1371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {d0}, [%[kC1C2]] \n" 1381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2: in[0] 1401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3: in[8] 1411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4: in[4] 1421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5: in[12] 1431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d4 \n" 1441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = {in[4], in[12]} * kC1 * 2 >> 16 1461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q9 = {in[4], in[12]} * kC2 >> 16 1471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 1491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d22 = a = in[0] + in[8] 1511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d23 = b = in[0] - in[8] 1521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d22, d2, d3 \n" 1531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d23, d2, d3 \n" 1541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = in[4]/[12] * kC1 >> 16 1561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 q8, q8, #1 \n" 1571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Add {in[4], in[12]} back after the multiplication. 
1591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q8, q2, q8 \n" 1601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d20 = c = in[4]*kC2 - in[12]*kC1 1621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d21 = d = in[4]*kC1 + in[12]*kC2 1631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d20, d18, d17 \n" 1641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d21, d19, d16 \n" 1651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2 = tmp[0] = a + d 1671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3 = tmp[1] = b + c 1681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4 = tmp[2] = b - c 1691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5 = tmp[3] = a - d 1701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d2, d22, d21 \n" 1711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d3, d23, d20 \n" 1721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d4, d23, d20 \n" 1731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d5, d22, d21 \n" 1741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 1761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 1771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d4 \n" 1791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 1811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q9 = {tmp[4], tmp[12]} * kC2 >> 16 1821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 1831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 1841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
1851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d22 = a = tmp[0] + tmp[8] 1861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d23 = b = tmp[0] - tmp[8] 1871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d22, d2, d3 \n" 1881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d23, d2, d3 \n" 1891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 q8, q8, #1 \n" 1911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q8, q2, q8 \n" 1921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d20 = c = in[4]*kC2 - in[12]*kC1 1941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d21 = d = in[4]*kC1 + in[12]*kC2 1951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d20, d18, d17 \n" 1961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d21, d19, d16 \n" 1971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2 = tmp[0] = a + d 1991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3 = tmp[1] = b + c 2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4 = tmp[2] = b - c 2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5 = tmp[3] = a - d 2021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d2, d22, d21 \n" 2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d3, d23, d20 \n" 2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d4, d23, d20 \n" 2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d5, d22, d21 \n" 2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d6[0], [%[ref]], %[kBPS] \n" 2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d6[1], [%[ref]], %[kBPS] \n" 2091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d7[0], [%[ref]], %[kBPS] \n" 
2101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d7[1], [%[ref]], %[kBPS] \n" 2111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "sub %[ref], %[ref], %[kBPS], lsl #2 \n" 2131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // (val) + 4 >> 3 2151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d2, d2, #3 \n" 2161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d3, d3, #3 \n" 2171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d4, d4, #3 \n" 2181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d5, d5, #3 \n" 2191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 2211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 2221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Must accumulate before saturating 2241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u8 q8, d6 \n" 2251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u8 q9, d7 \n" 2261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q1, q1, q8 \n" 2281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q2, q2, q9 \n" 2291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqmovun.s16 d0, q1 \n" 2311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqmovun.s16 d1, q2 \n" 2321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 2341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 2351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 
2361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d1[1], [%[dst]] \n" 2371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [in] "+r"(in), [dst] "+r"(dst) // modified registers 2391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants 2401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered 2411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ); 2421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 244af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // USE_INTRINSICS 245af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 2461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransform(const uint8_t* ref, 2471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* in, uint8_t* dst, int do_two) { 2481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ITransformOne(ref, in, dst); 2491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora if (do_two) { 2501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ITransformOne(ref + 4, in + 16, dst + 4); 2511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 2521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 254af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Load all 4x4 pixels into a single uint8x16_t variable. 
255af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic uint8x16_t Load4x4(const uint8_t* src) { 256af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t out = { 0, 0, 0, 0 }; 257af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); 258af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); 259af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); 260af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); 261af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return vreinterpretq_u8_u32(out); 2621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Forward transform. 2651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 266af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS) 267af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 268af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, 269af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t C, const int16x4_t D, 270af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x8_t* const out01, 271af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x8_t* const out32) { 272af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4x2_t AB = vtrn_s16(A, B); 273af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4x2_t CD = vtrn_s16(C, D); 274af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), 275af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s32_s16(CD.val[0])); 276af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2x2_t tmp13 
= vtrn_s32(vreinterpret_s32_s16(AB.val[1]), 277af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s32_s16(CD.val[1])); 278af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora *out01 = vreinterpretq_s16_s64( 279af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]), 280af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s64_s32(tmp13.val[0]))); 281af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora *out32 = vreinterpretq_s16_s64( 282af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]), 283af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s64_s32(tmp02.val[1]))); 284af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 285af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 286af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, 287af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t b) { 288af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return vreinterpretq_s16_u16(vsubl_u8(a, b)); 289af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 290af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 291af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref, 292af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16_t* out) { 293af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x8_t d0d1, d3d2; // working 4x4 int16 variables 294af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 295af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t S0 = Load4x4(src); 296af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t R0 = Load4x4(ref); 297af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0)); 298af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), 
vget_high_u8(R0)); 299af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t D0 = vget_low_s16(D0D1); 300af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t D1 = vget_high_s16(D0D1); 301af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t D2 = vget_low_s16(D2D3); 302af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t D3 = vget_high_s16(D2D3); 303af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2); 304af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 305af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { // 1rst pass 306af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t kCst937 = vdupq_n_s32(937); 307af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t kCst1812 = vdupq_n_s32(1812); 308af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) 309af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) 310af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3); 311af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2), 312af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_high_s16(a0a1_2)); 313af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2), 314af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_high_s16(a0a1_2)); 315af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); 316af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); 317af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); 318af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const 
int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); 319af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9); 320af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9); 321af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); 322af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 323af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { // 2nd pass 324af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0) 325af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16)); 326af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t kCst51000 = vdupq_n_s32(51000); 327af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) 328af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) 329af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7)); 330af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4); 331af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4); 332af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); 333af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); 334af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); 335af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3_m_a2 = 
vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); 336af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000); 337af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000); 338af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t a3_eq_0 = 339af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0))); 340af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0); 341af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 0, out0); 342af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 4, out1); 343af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 8, out2); 344af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 12, out3); 345af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 346af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 347af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 348af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else 349af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 3501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm 3511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int16_t kCoeff16[] = { 3521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 3531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}; 3541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int32_t kCoeff32[] = { 3551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1812, 1812, 1812, 1812, 3561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 937, 937, 937, 937, 3571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 12000, 12000, 12000, 12000, 3581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 51000, 51000, 51000, 51000 3591e7bf8805bd030c19924a5306837ecd72c295751Vikas 
Arora}; 3601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref, 3621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int16_t* out) { 3631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 3641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* src_ptr = src; 3651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* ref_ptr = ref; 3661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* coeff16 = kCoeff16; 3671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int32_t* coeff32 = kCoeff32; 3681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 3701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load src into q4, q5 in high half 3711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n" 3721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" 3731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" 3741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d11}, [%[src_ptr]] \n" 3751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load ref into q6, q7 in high half 3771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" 3781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" 3791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" 3801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d15}, [%[ref_ptr]] \n" 3811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Pack the high values in to q4 and q6 3831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
"vtrn.32 q4, q5 \n" 3841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q6, q7 \n" 3851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d[0-3] = src - ref 3871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q0, d8, d12 \n" 3881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q1, d9, d13 \n" 3891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff16 into q8(d16=5352, d17=2217) 3911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {q8}, [%[coeff16]] \n" 3921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff32 high half into q9 = 1812, q10 = 937 3941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 {q9, q10}, [%[coeff32]]! \n" 3951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff32 low half into q11=12000, q12=51000 3971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 {q11,q12}, [%[coeff32]] \n" 3981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // part 1 4001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose. 
Register dN is the same as dN in C 4011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d0, d2 \n" 4021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d1, d3 \n" 4031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d0, d1 \n" 4041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d2, d3 \n" 4051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 4071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 4081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 4091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 4101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d0, d4, d5 \n" // a0 + a1 4121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 4131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d2, d4, d5 \n" // a0 - a1 4141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 4151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 4171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 4181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 4191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 4201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9 4221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9 
4231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d1, q9, #9 \n" 4241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d3, q10, #9 \n" 4251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // part 2 4271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] 4281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d0, d2 \n" 4291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d1, d3 \n" 4301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d0, d1 \n" 4311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d2, d3 \n" 4321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmov.s16 d26, #7 \n" 4341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] 4361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] 4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] 4381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d4, d26 \n" // a1 + 7 4391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] 4401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 4421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 4431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 4451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 4461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
4471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vceq.s16 d4, d7, #0 \n" 4481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 d0, d0, #4 \n" 4501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 d2, d2, #4 \n" 4511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 4531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 4541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4550406ce1417f76f2034833414dcecc9f56253640cVikas Arora "vmvn d4, d4 \n" // !(d1 == 0) 4561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[4] = (c1*2217 + d1*5352 + 12000)>>16 4571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d1, q11, #16 \n" 4581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[4] += (d1!=0) 4591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d1, d1, d4 \n" 4601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[12]= (d1*2217 - c1*5352 + 51000)>>16 4611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d3, q12, #16 \n" 4621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // set result to out array 4641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.16 {q0, q1}, [%[out]] \n" 4651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), 4661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora [coeff32] "+r"(coeff32) // modified registers 4671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), 4681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora [out] "r"(out) // constants 4691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "q10", "q11", "q12", "q13" // clobbered 4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ); 4721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 4731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 474af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif 475af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 476af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#define LOAD_LANE_16b(VALUE, LANE) do { \ 477af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ 478af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora src += stride; \ 479af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} while (0) 480af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 481af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void FTransformWHT(const int16_t* src, int16_t* out) { 482af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int stride = 16; 483af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t zero = vdup_n_s16(0); 484af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int32x4x4_t tmp0; 485af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16x4x4_t in; 486af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora INIT_VECTOR4(in, zero, zero, zero, zero); 487af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[0], 0); 488af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[1], 0); 489af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[2], 0); 490af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[3], 0); 491af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[0], 1); 492af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[1], 1); 493af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[2], 1); 494af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[3], 1); 
495af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[0], 2); 496af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[1], 2); 497af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[2], 2); 498af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[3], 2); 499af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[0], 3); 500af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[1], 3); 501af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[2], 3); 502af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_16b(in.val[3], 3); 503af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 504af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 505af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0 = in[0 * 16] + in[2 * 16] 506af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a1 = in[1 * 16] + in[3 * 16] 507af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a2 = in[1 * 16] - in[3 * 16] 508af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a3 = in[0 * 16] - in[2 * 16] 509af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]); 510af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]); 511af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]); 512af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]); 513af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora tmp0.val[0] = vaddq_s32(a0, a1); 514af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora tmp0.val[1] = vaddq_s32(a3, a2); 515af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora tmp0.val[2] = vsubq_s32(a3, a2); 516af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora tmp0.val[3] = vsubq_s32(a0, a1); 517af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 
518af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 519af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t tmp1 = Transpose4x4(tmp0); 520af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0 = tmp[0 + i] + tmp[ 8 + i] 521af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a1 = tmp[4 + i] + tmp[12 + i] 522af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a2 = tmp[4 + i] - tmp[12 + i] 523af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a3 = tmp[0 + i] - tmp[ 8 + i] 524af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]); 525af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]); 526af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]); 527af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]); 528af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1 529af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1 530af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1 531af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1 532af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out0 = vmovn_s32(b0); 533af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out1 = vmovn_s32(b1); 534af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out2 = vmovn_s32(b2); 535af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x4_t out3 = vmovn_s32(b3); 536af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 537af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 0, out0); 538af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 4, 
out1); 539af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 8, out2); 540af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_s16(out + 12, out3); 541af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 5421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 543af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef LOAD_LANE_16b 5441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 5451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 5461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Texture distortion 5471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// 5481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// We try to match the spectral content (weighted) between source and 5491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// reconstructed samples. 5501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 551af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// This code works but is *slower* than the inlined-asm version below 552af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to 553af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// USE_INTRINSICS define. 554af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// With gcc-4.8, it's only slightly slower than the inlined. 555af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if defined(USE_INTRINSICS) 556af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 557af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Zero extend an uint16x4_t 'v' to an int32x4_t. 
558af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { 559af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return vreinterpretq_s32_u32(vmovl_u16(v)); 560af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 561af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 562af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Does a regular 4x4 transpose followed by an adjustment of the upper columns 563af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// in the inner rows to restore the source order of differences, 564af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// i.e., a0 - a1 | a3 - a2. 565af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { 566af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int32x4x4_t out = Transpose4x4(rows); 567af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // restore source order in the columns containing differences. 568af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2_t r1h = vget_high_s32(out.val[1]); 569af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2_t r2h = vget_high_s32(out.val[2]); 570af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); 571af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); 572af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return out; 573af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 574af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 575af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, 576af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t r2r3) { 577af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0 = in[0] + in[2] | a1 = in[1] + in[3] 578af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t a0a1 = 
vaddl_u8(r0r1, r2r3); 579af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a3 = in[0] - in[2] | a2 = in[1] - in[3] 580af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); 581af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 582af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 583af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. 584af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 585af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 586af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8x2_t transpose = 587af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); 588af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 589af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), 590af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_low_s16(transpose.val[1])); 591af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), 592af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_high_s16(transpose.val[1])); 593af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // [0]: tmp[3] [1]: tmp[2] 594af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); 595af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; 596af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return res; 597af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 
598af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 599af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { 600af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a0 = tmp[0 + i] + tmp[8 + i]; 601af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); 602af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a1 = tmp[4 + i] + tmp[12+ i]; 603af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); 604af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a2 = tmp[4 + i] - tmp[12+ i]; 605af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); 606af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a3 = tmp[0 + i] - tmp[8 + i]; 607af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); 608af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) 609af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) 610af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) 611af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) 612af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t res = { { b0, b1, b2, b3 } }; 613af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return res; 614af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 615af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 616af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Calculate the weighted sum of the rows in 'b'. 
617af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, 618af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w0, const int32x4_t w1, 619af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w2, const int32x4_t w3) { 620af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t s0 = vmulq_s32(w0, b.val[0]); 621af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); 622af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); 623af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); 624af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int64x2_t sum1 = vpaddlq_s32(s3); 625af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); 626af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return sum2; 627af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 628af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 629af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#define LOAD_LANE_32b(src, VALUE, LANE) \ 630af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) 631af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 632af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Hadamard transform 633af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients. 
634af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b, 635af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16_t* const w) { 636af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t d0d1 = { 0, 0, 0, 0 }; 637af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t d2d3 = { 0, 0, 0, 0 }; 638af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 639af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 640af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 641af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 642af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 643af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 644af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 645af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 646af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 647af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 648af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 649af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 650af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8x2_t tmp = 651af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); 652af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); 653af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 
const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); 654af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), 655af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_low_u8(d2d3u8)); 656af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), 657af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vget_high_u8(d2d3u8)); 658af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); 659af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); 660af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); 661af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); 662af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); 663af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); 664af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); 665af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); 666af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); 667af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); 668af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), 669af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_s32_s64(sum2)); 670af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int32x2_t res = vshr_n_s32(diff, 5); 671af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return vget_lane_s32(res, 0); 672af51b94a435132e9014c324e25fb686b3d07a8c8Vikas 
Arora } 673af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 674af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 675af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#undef LOAD_LANE_32b 676af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 677af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#else 678af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 6791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Hadamard transform 6801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients. 6811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b, 6821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* const w) { 6831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 6841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* A = a; 6851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* B = b; 6861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* W = w; 6871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int sum; 6881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 6891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d0[0], [%[a]], %[kBPS] \n" 6901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d0[1], [%[a]], %[kBPS] \n" 6911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d2[0], [%[a]], %[kBPS] \n" 6921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d2[1], [%[a]] \n" 6931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 6941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d1[0], [%[b]], %[kBPS] \n" 6951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d1[1], [%[b]], %[kBPS] \n" 6961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d3[0], [%[b]], %[kBPS] \n" 6971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d3[1], [%[b]] \n" 
6981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 6991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a d0/d2, b d1/d3 7001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d0/d1: 01 01 01 01 7011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2/d3: 23 23 23 23 7021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // But: it goes 01 45 23 67 7031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Notice the middle values are transposed 7041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q0, q1 \n" 7051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {a0, a1} = {in[0] + in[2], in[1] + in[3]} 7071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vaddl.u8 q2, d0, d2 \n" 7081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vaddl.u8 q10, d1, d3 \n" 7091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {a3, a2} = {in[0] - in[2], in[1] - in[3]} 7101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q3, d0, d2 \n" 7111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q11, d1, d3 \n" 7121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // tmp[0] = a0 + a1 7141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q0, q2 \n" 7151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q8, q10 \n" 7161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // tmp[1] = a3 + a2 7181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q1, q3 \n" 7191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q9, q11 \n" 7201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // No pair subtract 7221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q2 = {a0, a3} 7231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q3 = {a1, a2} 
7241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q2, q3 \n" 7251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q10, q11 \n" 7261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} 7281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q12, d4, d6 \n" 7291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q13, d5, d7 \n" 7301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q14, d20, d22 \n" 7311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q15, d21, d23 \n" 7321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // separate tmp[3] and tmp[2] 7341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q12 = tmp[3] 7351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q13 = tmp[2] 7361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q12, q13 \n" 7371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q14, q15 \n" 7381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose tmp for a 7401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d1, d26 \n" // vtrn.64 7411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d24 \n" // vtrn.64 7421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q0, q1 \n" 7431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q13, q12 \n" 7441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose tmp for b 7461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d17, d30 \n" // vtrn.64 7471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d19, d28 \n" // vtrn.64 7481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q8, q9 \n" 7491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q15, q14 \n" 
7501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // The first Q register is a, the second b. 7521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q0/8 tmp[0-3] 7531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q13/15 tmp[4-7] 7541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q1/9 tmp[8-11] 7551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q12/14 tmp[12-15] 7561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // These are still in 01 45 23 67 order. We fix it easily in the addition 7588b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora // case but the subtraction propagates them. 7591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d27 \n" 7601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d19, d31 \n" 7611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a0 = tmp[0] + tmp[8] 7631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q2, q0, q1 \n" 7641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q3, q8, q9 \n" 7651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a1 = tmp[4] + tmp[12] 7671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q10, q13, q12 \n" 7681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q11, q15, q14 \n" 7691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a2 = tmp[4] - tmp[12] 7711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q13, q13, q12 \n" 7721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q15, q15, q14 \n" 7731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a3 = tmp[0] - tmp[8] 7751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q0, q0, q1 \n" 
7761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q8, q8, q9 \n" 7771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b0 = a0 + a1 7791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q1, q2, q10 \n" 7801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q9, q3, q11 \n" 7811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b1 = a3 + a2 7831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q12, q0, q13 \n" 7841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q14, q8, q15 \n" 7851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b2 = a3 - a2 7871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q0, q0, q13 \n" 7881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q8, q8, q15 \n" 7891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b3 = a0 - a1 7911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q2, q2, q10 \n" 7921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q3, q3, q11 \n" 7931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.64 {q10, q11}, [%[w]] \n" 7951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b0) 7971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q1, q1 \n" 7981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q9, q9 \n" 7991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b1) 8001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q12, q12 \n" 8011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q14, q14 \n" 8021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b2) 8031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
"vabs.s32 q0, q0 \n" 8041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q8, q8 \n" 8051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b3) 8061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q2, q2 \n" 8071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q3, q3 \n" 8081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // expand w before using. 8101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q13, d20 \n" 8111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q15, d21 \n" 8121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[0] * abs(b0) 8141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmul.u32 q1, q1, q13 \n" 8151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmul.u32 q9, q9, q13 \n" 8161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[4] * abs(b1) 8181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q12, q15 \n" 8191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q14, q15 \n" 8201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // expand w before using. 
8221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q13, d22 \n" 8231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q15, d23 \n" 8241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[8] * abs(b1) 8261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q0, q13 \n" 8271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q8, q13 \n" 8281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[12] * abs(b1) 8301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q2, q15 \n" 8311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q3, q15 \n" 8321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Sum the arrays 8341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.u32 q1, q1 \n" 8351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.u32 q9, q9 \n" 8361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.u64 d2, d3 \n" 8371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.u64 d18, d19 \n" 8381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Hadamard transform needs 4 bits of extra precision (2 bits in each 8401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum 8411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // precision for coeff is 8bit of input + 4bits of Hadamard transform + 8421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // 16bits for w[] + 2 bits of abs() summation. 8431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // 8441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is 8451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // A-OK. 
8461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // sum2 - sum1 8481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.u32 d0, d2, d18 \n" 8491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(sum2 - sum1) 8501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 d0, d0 \n" 8511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(sum2 - sum1) >> 5 8521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.u32 d0, #5 \n" 8531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // It would be better to move the value straight into r0 but I'm not 8551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // entirely sure how this works with inline assembly. 8561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmov.32 %[sum], d0[0] \n" 8571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) 8591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS) 8601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 8611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "q10", "q11", "q12", "q13", "q14", "q15" // clobbered 8621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ) ; 8631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora return sum; 8651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 8661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 867af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // USE_INTRINSICS 868af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 8691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b, 8701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* const w) { 
8711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int D = 0; 8721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int x, y; 8731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora for (y = 0; y < 16 * BPS; y += 4 * BPS) { 8741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora for (x = 0; x < 16; x += 4) { 8751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora D += Disto4x4(a + x + y, b + x + y, w); 8761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 8771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 8781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora return D; 8791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 8801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 881af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------ 882af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 883af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 884af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int start_block, int end_block, 885af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8Histogram* const histo) { 886af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); 887af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int j; 888af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (j = start_block; j < end_block; ++j) { 889af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int16_t out[16]; 890af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 891af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 892af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int k; 893af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a0 = vld1q_s16(out + 0); 894af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t b0 = vld1q_s16(out + 8); 
895af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); 896af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); 897af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t a2 = vshrq_n_u16(a1, 3); 898af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t b2 = vshrq_n_u16(b1, 3); 899af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); 900af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); 901af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); 902af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); 903af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Convert coefficients to bin. 904af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (k = 0; k < 16; ++k) { 905af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora histo->distribution[out[k]]++; 906af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 907af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 908af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 909af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 910af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 911af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------ 912af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 913af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, 914af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8_t* const b, 915af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t* const sum) { 916af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t a0 = vld1q_u8(a); 917af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 
const uint8x16_t b0 = vld1q_u8(b); 918af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t abs_diff = vabdq_u8(a0, b0); 919af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 920af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 921af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate 922af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 923af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 924af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Horizontal sum of all four uint32_t values in 'sum'. 925af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SumToInt(uint32x4_t sum) { 926af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint64x2_t sum2 = vpaddlq_u32(sum); 927af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); 928af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return (int)sum3; 929af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 930af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 931af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) { 932af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t sum = { 0, 0, 0, 0 }; 933af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int y; 934af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (y = 0; y < 16; ++y) { 935af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 936af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 937af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return SumToInt(sum); 938af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 939af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 940af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int 
SSE16x8(const uint8_t* a, const uint8_t* b) { 941af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t sum = { 0, 0, 0, 0 }; 942af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int y; 943af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (y = 0; y < 8; ++y) { 944af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 945af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 946af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return SumToInt(sum); 947af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 948af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 949af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) { 950af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint32x4_t sum = { 0, 0, 0, 0 }; 951af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora int y; 952af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora for (y = 0; y < 8; ++y) { 953af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t a0 = vld1_u8(a + y * BPS); 954af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t b0 = vld1_u8(b + y * BPS); 955af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x8_t abs_diff = vabd_u8(a0, b0); 956af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); 957af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora sum = vpadalq_u16(sum, prod); 958af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora } 959af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return SumToInt(sum); 960af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 961af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 962af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) { 963af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t a0 = Load4x4(a); 964af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t b0 = Load4x4(b); 
965af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint8x16_t abs_diff = vabdq_u8(a0, b0); 966af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 967af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 968af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return SumToInt(vpaddlq_u16(prod)); 969af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 970af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 971af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora//------------------------------------------------------------------------------ 972af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 973af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// Compilation with gcc-4.6.x is problematic for now and vtbl? are unavailable 974af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora// in iOS/arm64 builds. Disable this function in those cases. 975af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) 976af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 977af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int16x8_t Quantize(int16_t* const in, 978af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const VP8Matrix* const mtx, int offset) { 979af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); 980af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); 981af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); 982af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); 983af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); 984af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 
985af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t a = vld1q_s16(in + offset); // in 986af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) 987af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t sign = vshrq_n_s16(a, 15); // sign 988af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen 989af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); 990af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); 991af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t m2 = vhaddq_u32(m0, bias0); 992af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 993af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), 994af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 995af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); 996af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); 997af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign 998af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); 999af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1q_s16(in + offset, c4); 1000af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 1001af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return c3; 1002af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 1003af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 
1004af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic const uint8_t kShuffles[4][8] = { 1005af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 0, 1, 2, 3, 8, 9, 16, 17 }, 1006af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 10, 11, 4, 5, 6, 7, 12, 13 }, 1007af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 18, 19, 24, 25, 26, 27, 20, 21 }, 1008af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora { 14, 15, 22, 23, 28, 29, 30, 31 } 1009af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora}; 1010af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 1011af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16], 1012af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const VP8Matrix* const mtx) { 1013af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t out0 = Quantize(in, mtx, 0); 1014af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora const int16x8_t out1 = Quantize(in, mtx, 8); 1015af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora uint8x8x4_t all_out; 1016af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora INIT_VECTOR4(all_out, 1017af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_u8_s16(vget_low_s16(out0)), 1018af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_u8_s16(vget_high_s16(out0)), 1019af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_u8_s16(vget_low_s16(out1)), 1020af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vreinterpret_u8_s16(vget_high_s16(out1))); 1021af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // Zigzag reordering 1022af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_u8((uint8_t*)(out + 0), vtbl4_u8(all_out, vld1_u8(kShuffles[0]))); 1023af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_u8((uint8_t*)(out + 4), vtbl4_u8(all_out, vld1_u8(kShuffles[1]))); 1024af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_u8((uint8_t*)(out + 8), vtbl4_u8(all_out, vld1_u8(kShuffles[2]))); 
1025af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora vst1_u8((uint8_t*)(out + 12), vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); 1026af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora // test zeros 1027af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (*(uint64_t*)(out + 0) != 0) return 1; 1028af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (*(uint64_t*)(out + 4) != 0) return 1; 1029af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (*(uint64_t*)(out + 8) != 0) return 1; 1030af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora if (*(uint64_t*)(out + 12) != 0) return 1; 1031af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora return 0; 1032af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora} 1033af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 1034af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif // !WORK_AROUND_GCC && !__aarch64__ 1035af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora 10361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif // WEBP_USE_NEON 10371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 10391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point 10401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10411e7bf8805bd030c19924a5306837ecd72c295751Vikas Aroraextern void VP8EncDspInitNEON(void); 10421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10431e7bf8805bd030c19924a5306837ecd72c295751Vikas Aroravoid VP8EncDspInitNEON(void) { 10441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON) 10451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8ITransform = ITransform; 10461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8FTransform = FTransform; 10471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8FTransformWHT = FTransformWHT; 
10491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8TDisto4x4 = Disto4x4; 10511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8TDisto16x16 = Disto16x16; 1052af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8CollectHistogram = CollectHistogram; 1053af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8SSE16x16 = SSE16x16; 1054af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8SSE16x8 = SSE16x8; 1055af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8SSE8x8 = SSE8x8; 1056af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8SSE4x4 = SSE4x4; 1057af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#if !(defined(WORK_AROUND_GCC) || defined(__aarch64__)) 1058af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora VP8EncQuantizeBlock = QuantizeBlock; 1059af51b94a435132e9014c324e25fb686b3d07a8c8Vikas Arora#endif 10601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif // WEBP_USE_NEON 10611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 1062