// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// 121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from libvpx (http://www.webmproject.org/code/) 131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "./dsp.h" 151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON) 171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include <assert.h> 1933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#include "./neon.h" 211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#include "../enc/vp8enci.h" 221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Transforms (Paragraph 14.4) 251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Inverse transform. 2733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This code is pretty much the same as TransformOne in the dec_neon.c, except 281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// for subtraction to *ref. See the comments there for algorithmic explanations. 2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic const int16_t kC1 = 20091; 3133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic const int16_t kC2 = 17734; // half of kC2, actually. See comment above. 3233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This code works but is *slower* than the inlined-asm version below 3433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// (with gcc-4.6). So we disable it for now. 
Later, it'll be conditional to 3533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// USE_INTRINSICS define. 3633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// With gcc-4.8, it's a little faster speed than inlined-assembly. 3733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS) 3833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t. 4033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) { 4133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v))); 4233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 4333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 4433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result 4533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// to the corresponding rows of 'dst'. 4633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst, 4733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t dst01, 4833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t dst23) { 4933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Unsigned saturate to 8b. 5033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t dst01_u8 = vqmovun_s16(dst01); 5133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t dst23_u8 = vqmovun_s16(dst23); 5233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 5333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Store the results. 
5433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0); 5533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1); 5633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0); 5733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1); 5833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 5933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 6033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23, 6133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* const ref, uint8_t* const dst) { 6233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32x2_t dst01 = vdup_n_u32(0); 6333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32x2_t dst23 = vdup_n_u32(0); 6433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 6533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Load the source pixels. 6633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0); 6733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0); 6833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1); 6933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1); 7033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 7133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 7233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Convert to 16b. 
7333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t dst01_s16 = ConvertU8ToS16(dst01); 7433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t dst23_s16 = ConvertU8ToS16(dst23); 7533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 7633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Descale with rounding. 7733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3); 7833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3); 7933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Add the inverse transform. 8033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora SaturateAndStore4x4(dst, out01, out23); 8133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 8233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 8333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 8433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1, 8533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x8x2_t* const out) { 8633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1 8733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3 8833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ... 8933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // b0 d0 b1 d1 b2 d2 ... 
9033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora *out = vzipq_s16(tmp0.val[0], tmp0.val[1]); 9133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 9233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 9333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void TransformPass(int16x8x2_t* const rows) { 9433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // {rows} = in0 | in4 9533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // in8 | in12 9633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // B1 = in4 | in12 9733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t B1 = 9833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1])); 9933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // C0 = kC1 * in4 | kC1 * in12 10033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // C1 = kC2 * in4 | kC2 * in12 10133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1); 10233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2); 10333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]), 10433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_low_s16(rows->val[1])); // in0 + in8 10533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]), 10633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_low_s16(rows->val[1])); // in0 - in8 10733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // c = kC2 * in4 - kC1 * in12 10833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // d = kC1 * in4 + kC2 * in12 10933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0)); 11033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1)); 
11133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b 11233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c 11333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c 11433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c 11533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp)); 11633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora Transpose8x2(E0, E1, rows); 11733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 11833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 11933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ITransformOne(const uint8_t* ref, 12033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16_t* in, uint8_t* dst) { 12133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x8x2_t rows; 12233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8)); 12333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora TransformPass(&rows); 12433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora TransformPass(&rows); 12533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora Add4x4(rows.val[0], rows.val[1], ref, dst); 12633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 12733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 12833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 12933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 1301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransformOne(const uint8_t* ref, 1311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* in, uint8_t* dst) { 1321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 13333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16_t kC1C2[] = { kC1, kC2, 0, 0 }; 
1341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 1361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {q1, q2}, [%[in]] \n" 1371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {d0}, [%[kC1C2]] \n" 1381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2: in[0] 1401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3: in[8] 1411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4: in[4] 1421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5: in[12] 1431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d4 \n" 1441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = {in[4], in[12]} * kC1 * 2 >> 16 1461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q9 = {in[4], in[12]} * kC2 >> 16 1471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 1491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d22 = a = in[0] + in[8] 1511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d23 = b = in[0] - in[8] 1521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d22, d2, d3 \n" 1531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d23, d2, d3 \n" 1541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = in[4]/[12] * kC1 >> 16 1561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 q8, q8, #1 \n" 1571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Add {in[4], in[12]} back after the multiplication. 
1591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q8, q2, q8 \n" 1601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d20 = c = in[4]*kC2 - in[12]*kC1 1621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d21 = d = in[4]*kC1 + in[12]*kC2 1631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d20, d18, d17 \n" 1641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d21, d19, d16 \n" 1651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2 = tmp[0] = a + d 1671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3 = tmp[1] = b + c 1681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4 = tmp[2] = b - c 1691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5 = tmp[3] = a - d 1701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d2, d22, d21 \n" 1711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d3, d23, d20 \n" 1721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d4, d23, d20 \n" 1731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d5, d22, d21 \n" 1741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 1761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 1771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d4 \n" 1791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16 1811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q9 = {tmp[4], tmp[12]} * kC2 >> 16 1821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q8, q2, d0[0] \n" 1831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqdmulh.s16 q9, q2, d0[1] \n" 1841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
1851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d22 = a = tmp[0] + tmp[8] 1861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d23 = b = tmp[0] - tmp[8] 1871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d22, d2, d3 \n" 1881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d23, d2, d3 \n" 1891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 q8, q8, #1 \n" 1911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q8, q2, q8 \n" 1921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d20 = c = in[4]*kC2 - in[12]*kC1 1941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d21 = d = in[4]*kC1 + in[12]*kC2 1951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d20, d18, d17 \n" 1961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d21, d19, d16 \n" 1971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2 = tmp[0] = a + d 1991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d3 = tmp[1] = b + c 2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d4 = tmp[2] = b - c 2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d5 = tmp[3] = a - d 2021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d2, d22, d21 \n" 2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 d3, d23, d20 \n" 2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d4, d23, d20 \n" 2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqsub.s16 d5, d22, d21 \n" 2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d6[0], [%[ref]], %[kBPS] \n" 2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d6[1], [%[ref]], %[kBPS] \n" 2091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d7[0], [%[ref]], %[kBPS] \n" 
2101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d7[1], [%[ref]], %[kBPS] \n" 2111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "sub %[ref], %[ref], %[kBPS], lsl #2 \n" 2131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // (val) + 4 >> 3 2151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d2, d2, #3 \n" 2161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d3, d3, #3 \n" 2171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d4, d4, #3 \n" 2181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vrshr.s16 d5, d5, #3 \n" 2191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 2211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vzip.16 q1, q2 \n" 2221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Must accumulate before saturating 2241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u8 q8, d6 \n" 2251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u8 q9, d7 \n" 2261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q1, q1, q8 \n" 2281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqadd.s16 q2, q2, q9 \n" 2291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqmovun.s16 d0, q1 \n" 2311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vqmovun.s16 d1, q2 \n" 2321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d0[0], [%[dst]], %[kBPS] \n" 2341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d0[1], [%[dst]], %[kBPS] \n" 2351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d1[0], [%[dst]], %[kBPS] \n" 
2361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.32 d1[1], [%[dst]] \n" 2371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [in] "+r"(in), [dst] "+r"(dst) // modified registers 2391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants 2401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered 2411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ); 2421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 24433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif // USE_INTRINSICS 24533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 2461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void ITransform(const uint8_t* ref, 2471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* in, uint8_t* dst, int do_two) { 2481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ITransformOne(ref, in, dst); 2491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora if (do_two) { 2501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ITransformOne(ref + 4, in + 16, dst + 4); 2511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 2521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 25433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Load all 4x4 pixels into a single uint8x16_t variable. 
25533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic uint8x16_t Load4x4(const uint8_t* src) { 2568c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint32x4_t out = vdupq_n_u32(0); 25733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0); 25833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1); 25933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2); 26033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3); 26133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return vreinterpretq_u8_u32(out); 2621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 2631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Forward transform. 2651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 26633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS) 26733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 26833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B, 26933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t C, const int16x4_t D, 27033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x8_t* const out01, 27133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x8_t* const out32) { 27233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4x2_t AB = vtrn_s16(A, B); 27333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4x2_t CD = vtrn_s16(C, D); 27433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]), 27533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s32_s16(CD.val[0])); 27633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2x2_t tmp13 
= vtrn_s32(vreinterpret_s32_s16(AB.val[1]), 27733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s32_s16(CD.val[1])); 27833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora *out01 = vreinterpretq_s16_s64( 27933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]), 28033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s64_s32(tmp13.val[0]))); 28133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora *out32 = vreinterpretq_s16_s64( 28233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]), 28333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s64_s32(tmp02.val[1]))); 28433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 28533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 28633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a, 28733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t b) { 28833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return vreinterpretq_s16_u16(vsubl_u8(a, b)); 28933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 29033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 29133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref, 29233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16_t* out) { 29333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x8_t d0d1, d3d2; // working 4x4 int16 variables 29433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 29533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t S0 = Load4x4(src); 29633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t R0 = Load4x4(ref); 29733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0)); 29833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), 
vget_high_u8(R0)); 29933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t D0 = vget_low_s16(D0D1); 30033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t D1 = vget_high_s16(D0D1); 30133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t D2 = vget_low_s16(D2D3); 30233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t D3 = vget_high_s16(D2D3); 30333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2); 30433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 30533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { // 1rst pass 30633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t kCst937 = vdupq_n_s32(937); 30733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t kCst1812 = vdupq_n_s32(1812); 30833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) 30933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) 31033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3); 31133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2), 31233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_high_s16(a0a1_2)); 31333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2), 31433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_high_s16(a0a1_2)); 31533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); 31633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); 31733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); 31833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const 
int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); 31933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9); 32033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9); 32133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2); 32233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 32333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { // 2nd pass 32433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0) 32533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16)); 32633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t kCst51000 = vdupq_n_s32(51000); 32733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1) 32833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2) 32933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7)); 33033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4); 33133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4); 33233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217); 33333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217); 33433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352); 33533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3_m_a2 = 
vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352); 33633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000); 33733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000); 33833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t a3_eq_0 = 33933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0))); 34033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0); 34133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 0, out0); 34233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 4, out1); 34333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 8, out2); 34433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 12, out3); 34533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 34633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 34733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 34833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 34933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 3501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm 3511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int16_t kCoeff16[] = { 3521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217 3531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}; 3541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic const int32_t kCoeff32[] = { 3551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1812, 1812, 1812, 1812, 3561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 937, 937, 937, 937, 3571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 12000, 12000, 12000, 12000, 3581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 51000, 51000, 51000, 51000 3591e7bf8805bd030c19924a5306837ecd72c295751Vikas 
Arora}; 3601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic void FTransform(const uint8_t* src, const uint8_t* ref, 3621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int16_t* out) { 3631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 3641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* src_ptr = src; 3651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* ref_ptr = ref; 3661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int16_t* coeff16 = kCoeff16; 3671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int32_t* coeff32 = kCoeff32; 3681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 3701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load src into q4, q5 in high half 3711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n" 3721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n" 3731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n" 3741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d11}, [%[src_ptr]] \n" 3751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load ref into q6, q7 in high half 3771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n" 3781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n" 3791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n" 3801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.8 {d15}, [%[ref_ptr]] \n" 3811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Pack the high values in to q4 and q6 3831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
"vtrn.32 q4, q5 \n" 3841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q6, q7 \n" 3851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d[0-3] = src - ref 3871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q0, d8, d12 \n" 3881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q1, d9, d13 \n" 3891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff16 into q8(d16=5352, d17=2217) 3911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.16 {q8}, [%[coeff16]] \n" 3921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff32 high half into q9 = 1812, q10 = 937 3941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 {q9, q10}, [%[coeff32]]! \n" 3951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // load coeff32 low half into q11=12000, q12=51000 3971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 {q11,q12}, [%[coeff32]] \n" 3981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // part 1 4001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose. 
Register dN is the same as dN in C 4011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d0, d2 \n" 4021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d1, d3 \n" 4031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d0, d1 \n" 4041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d2, d3 \n" 4051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3 4071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2 4081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2 4091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3 4101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d0, d4, d5 \n" // a0 + a1 4121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3 4131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d2, d4, d5 \n" // a0 - a1 4141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3 4151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812 4171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937 4181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812 4191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352 4201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9 4221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9 
4231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d1, q9, #9 \n" 4241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d3, q10, #9 \n" 4251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // part 2 4271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12] 4281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d0, d2 \n" 4291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 d1, d3 \n" 4301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d0, d1 \n" 4311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 d2, d3 \n" 4321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmov.s16 d26, #7 \n" 4341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12] 4361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8] 4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8] 4381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d4, d4, d26 \n" // a1 + 7 4391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12] 4401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7 4421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7 4431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000 4451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000 4461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
4471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vceq.s16 d4, d7, #0 \n" 4481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 d0, d0, #4 \n" 4501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.s16 d2, d2, #4 \n" 4511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000 4531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000 4541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4550406ce1417f76f2034833414dcecc9f56253640cVikas Arora "vmvn d4, d4 \n" // !(d1 == 0) 4561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[4] = (c1*2217 + d1*5352 + 12000)>>16 4571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d1, q11, #16 \n" 4581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[4] += (d1!=0) 4591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s16 d1, d1, d4 \n" 4601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // op[12]= (d1*2217 - c1*5352 + 51000)>>16 4611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshrn.s32 d3, q12, #16 \n" 4621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 4631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // set result to out array 4641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vst1.16 {q0, q1}, [%[out]] \n" 4651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr), 4661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora [coeff32] "+r"(coeff32) // modified registers 4671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16), 4681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora [out] "r"(out) // constants 4691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "q10", "q11", "q12", "q13" // clobbered 4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ); 4721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 4731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 47433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif 47533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 47633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_16b(VALUE, LANE) do { \ 47733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora (VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \ 47833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora src += stride; \ 47933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} while (0) 48033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 48133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransformWHT(const int16_t* src, int16_t* out) { 48233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int stride = 16; 48333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t zero = vdup_n_s16(0); 48433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int32x4x4_t tmp0; 48533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16x4x4_t in; 48633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora INIT_VECTOR4(in, zero, zero, zero, zero); 48733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[0], 0); 48833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[1], 0); 48933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[2], 0); 49033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[3], 0); 49133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[0], 1); 49233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[1], 1); 49333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[2], 1); 49433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[3], 1); 
49533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[0], 2); 49633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[1], 2); 49733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[2], 2); 49833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[3], 2); 49933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[0], 3); 50033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[1], 3); 50133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[2], 3); 50233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_16b(in.val[3], 3); 50333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 50433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 50533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0 = in[0 * 16] + in[2 * 16] 50633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a1 = in[1 * 16] + in[3 * 16] 50733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a2 = in[1 * 16] - in[3 * 16] 50833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a3 = in[0 * 16] - in[2 * 16] 50933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]); 51033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]); 51133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]); 51233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]); 51333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora tmp0.val[0] = vaddq_s32(a0, a1); 51433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora tmp0.val[1] = vaddq_s32(a3, a2); 51533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora tmp0.val[2] = vsubq_s32(a3, a2); 51633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora tmp0.val[3] = vsubq_s32(a0, a1); 51733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 
51833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 51933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t tmp1 = Transpose4x4(tmp0); 52033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0 = tmp[0 + i] + tmp[ 8 + i] 52133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a1 = tmp[4 + i] + tmp[12 + i] 52233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a2 = tmp[4 + i] - tmp[12 + i] 52333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a3 = tmp[0 + i] - tmp[ 8 + i] 52433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]); 52533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]); 52633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]); 52733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]); 52833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1 52933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1 53033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1 53133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1 53233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out0 = vmovn_s32(b0); 53333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out1 = vmovn_s32(b1); 53433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out2 = vmovn_s32(b2); 53533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x4_t out3 = vmovn_s32(b3); 53633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 53733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 0, out0); 53833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 4, 
out1); 53933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 8, out2); 54033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1_s16(out + 12, out3); 54133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 5421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 54333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_LANE_16b 5441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 5451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 5461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Texture distortion 5471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// 5481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// We try to match the spectral content (weighted) between source and 5491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// reconstructed samples. 5501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 55133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// This code works but is *slower* than the inlined-asm version below 55233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to 55333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// USE_INTRINSICS define. 55433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// With gcc-4.8, it's only slightly slower than the inlined. 55533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#if defined(USE_INTRINSICS) 55633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 55733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Zero extend an uint16x4_t 'v' to an int32x4_t. 
55833f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) { 55933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return vreinterpretq_s32_u32(vmovl_u16(v)); 56033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 56133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 56233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Does a regular 4x4 transpose followed by an adjustment of the upper columns 56333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// in the inner rows to restore the source order of differences, 56433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// i.e., a0 - a1 | a3 - a2. 56533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) { 56633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int32x4x4_t out = Transpose4x4(rows); 56733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // restore source order in the columns containing differences. 56833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2_t r1h = vget_high_s32(out.val[1]); 56933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2_t r2h = vget_high_s32(out.val[2]); 57033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h); 57133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h); 57233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return out; 57333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 57433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 57533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1, 57633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t r2r3) { 57733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0 = in[0] + in[2] | a1 = in[1] + in[3] 57833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t a0a1 = 
vaddl_u8(r0r1, r2r3); 57933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a3 = in[0] - in[2] | a2 = in[1] - in[3] 58033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3); 58133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1 58233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2 58333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations. 58433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 58533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 58633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8x2_t transpose = 58733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2)); 58833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // tmp[3] = a0 - a1 | tmp[2] = a3 - a2 58933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]), 59033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_low_s16(transpose.val[1])); 59133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]), 59233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_high_s16(transpose.val[1])); 59333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // [0]: tmp[3] [1]: tmp[2] 59433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2); 59533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } }; 59633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return res; 59733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 
59833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 59933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) { 60033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a0 = tmp[0 + i] + tmp[8 + i]; 60133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]); 60233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a1 = tmp[4 + i] + tmp[12+ i]; 60333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]); 60433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a2 = tmp[4 + i] - tmp[12+ i]; 60533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]); 60633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a3 = tmp[0 + i] - tmp[8 + i]; 60733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]); 60833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1) 60933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2) 61033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2) 61133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1) 61233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t res = { { b0, b1, b2, b3 } }; 61333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return res; 61433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 61533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 61633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Calculate the weighted sum of the rows in 'b'. 
61733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b, 61833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w0, const int32x4_t w1, 61933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w2, const int32x4_t w3) { 62033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t s0 = vmulq_s32(w0, b.val[0]); 62133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]); 62233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]); 62333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]); 62433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int64x2_t sum1 = vpaddlq_s32(s3); 62533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1)); 62633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return sum2; 62733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 62833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 62933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#define LOAD_LANE_32b(src, VALUE, LANE) \ 63033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE)) 63133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 63233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Hadamard transform 63333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients. 
63433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b, 63533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16_t* const w) { 63633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32x4_t d0d1 = { 0, 0, 0, 0 }; 63733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32x4_t d2d3 = { 0, 0, 0, 0 }; 63833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03 63933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13 64033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03 64133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13 64233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23 64333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33 64433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23 64533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33 64633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 64733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 64833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31 64933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33 65033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8x2_t tmp = 65133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3)); 65233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]); 65333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 
const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]); 65433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8), 65533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_low_u8(d2d3u8)); 65633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8), 65733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vget_high_u8(d2d3u8)); 65833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a); 65933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b); 66033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a); 66133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b); 66233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0)); 66333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4)); 66433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8)); 66533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12)); 66633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3); 66733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3); 66833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1), 66933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_s32_s64(sum2)); 67033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int32x2_t res = vshr_n_s32(diff, 5); 67133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return vget_lane_s32(res, 0); 67233f74dabbc7920a65ed435d7417987589febdc16Vikas 
Arora } 67333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 67433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 67533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#undef LOAD_LANE_32b 67633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 67733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#else 67833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 6791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Hadamard transform 6801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Returns the weighted sum of the absolute value of transformed coefficients. 6811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b, 6821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* const w) { 6831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const int kBPS = BPS; 6841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* A = a; 6851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint8_t* B = b; 6861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* W = w; 6871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int sum; 6881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora __asm__ volatile ( 6891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d0[0], [%[a]], %[kBPS] \n" 6901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d0[1], [%[a]], %[kBPS] \n" 6911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d2[0], [%[a]], %[kBPS] \n" 6921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d2[1], [%[a]] \n" 6931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 6941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d1[0], [%[b]], %[kBPS] \n" 6951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d1[1], [%[b]], %[kBPS] \n" 6961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d3[0], [%[b]], %[kBPS] \n" 6971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.32 d3[1], [%[b]] \n" 
6981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 6991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a d0/d2, b d1/d3 7001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d0/d1: 01 01 01 01 7011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // d2/d3: 23 23 23 23 7021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // But: it goes 01 45 23 67 7031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Notice the middle values are transposed 7041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q0, q1 \n" 7051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {a0, a1} = {in[0] + in[2], in[1] + in[3]} 7071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vaddl.u8 q2, d0, d2 \n" 7081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vaddl.u8 q10, d1, d3 \n" 7091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {a3, a2} = {in[0] - in[2], in[1] - in[3]} 7101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q3, d0, d2 \n" 7111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.u8 q11, d1, d3 \n" 7121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // tmp[0] = a0 + a1 7141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q0, q2 \n" 7151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q8, q10 \n" 7161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // tmp[1] = a3 + a2 7181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q1, q3 \n" 7191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.s16 q9, q11 \n" 7201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // No pair subtract 7221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q2 = {a0, a3} 7231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q3 = {a1, a2} 
7241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q2, q3 \n" 7251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.16 q10, q11 \n" 7261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2} 7281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q12, d4, d6 \n" 7291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q13, d5, d7 \n" 7301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q14, d20, d22 \n" 7311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsubl.s16 q15, d21, d23 \n" 7321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // separate tmp[3] and tmp[2] 7341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q12 = tmp[3] 7351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q13 = tmp[2] 7361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q12, q13 \n" 7371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q14, q15 \n" 7381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose tmp for a 7401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d1, d26 \n" // vtrn.64 7411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d24 \n" // vtrn.64 7421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q0, q1 \n" 7431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q13, q12 \n" 7441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Transpose tmp for b 7461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d17, d30 \n" // vtrn.64 7471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d19, d28 \n" // vtrn.64 7481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q8, q9 \n" 7491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vtrn.32 q15, q14 \n" 
7501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // The first Q register is a, the second b. 7521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q0/8 tmp[0-3] 7531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q13/15 tmp[4-7] 7541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q1/9 tmp[8-11] 7551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // q12/14 tmp[12-15] 7561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // These are still in 01 45 23 67 order. We fix it easily in the addition 7588b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora // case but the subtraction propagates them. 7591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d3, d27 \n" 7601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vswp d19, d31 \n" 7611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a0 = tmp[0] + tmp[8] 7631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q2, q0, q1 \n" 7641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q3, q8, q9 \n" 7651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a1 = tmp[4] + tmp[12] 7671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q10, q13, q12 \n" 7681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q11, q15, q14 \n" 7691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a2 = tmp[4] - tmp[12] 7711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q13, q13, q12 \n" 7721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q15, q15, q14 \n" 7731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // a3 = tmp[0] - tmp[8] 7751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q0, q0, q1 \n" 
7761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q8, q8, q9 \n" 7771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b0 = a0 + a1 7791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q1, q2, q10 \n" 7801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q9, q3, q11 \n" 7811e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b1 = a3 + a2 7831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q12, q0, q13 \n" 7841e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.s32 q14, q8, q15 \n" 7851e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7861e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b2 = a3 - a2 7871e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q0, q0, q13 \n" 7881e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q8, q8, q15 \n" 7891e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7901e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // b3 = a0 - a1 7911e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q2, q2, q10 \n" 7921e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.s32 q3, q3, q11 \n" 7931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7941e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vld1.64 {q10, q11}, [%[w]] \n" 7951e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 7961e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b0) 7971e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q1, q1 \n" 7981e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q9, q9 \n" 7991e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b1) 8001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q12, q12 \n" 8011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q14, q14 \n" 8021e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b2) 8031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
"vabs.s32 q0, q0 \n" 8041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q8, q8 \n" 8051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(b3) 8061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q2, q2 \n" 8071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 q3, q3 \n" 8081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8091e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // expand w before using. 8101e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q13, d20 \n" 8111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q15, d21 \n" 8121e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[0] * abs(b0) 8141e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmul.u32 q1, q1, q13 \n" 8151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmul.u32 q9, q9, q13 \n" 8161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[4] * abs(b1) 8181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q12, q15 \n" 8191e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q14, q15 \n" 8201e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8211e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // expand w before using. 
8221e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q13, d22 \n" 8231e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmovl.u16 q15, d23 \n" 8241e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[8] * abs(b1) 8261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q0, q13 \n" 8271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q8, q13 \n" 8281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // w[12] * abs(b1) 8301e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q1, q2, q15 \n" 8311e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmla.u32 q9, q3, q15 \n" 8321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8331e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Sum the arrays 8341e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.u32 q1, q1 \n" 8351e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vpaddl.u32 q9, q9 \n" 8361e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.u64 d2, d3 \n" 8371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vadd.u64 d18, d19 \n" 8381e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8391e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Hadamard transform needs 4 bits of extra precision (2 bits in each 8401e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum 8411e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // precision for coeff is 8bit of input + 4bits of Hadamard transform + 8421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // 16bits for w[] + 2 bits of abs() summation. 8431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // 8441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is 8451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // A-OK. 
8461e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // sum2 - sum1 8481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vsub.u32 d0, d2, d18 \n" 8491e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(sum2 - sum1) 8501e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vabs.s32 d0, d0 \n" 8511e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // abs(sum2 - sum1) >> 5 8521e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vshr.u32 d0, #5 \n" 8531e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // It would be better to move the value straight into r0 but I'm not 8551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // entirely sure how this works with inline assembly. 8561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "vmov.32 %[sum], d0[0] \n" 8571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W) 8591e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : [kBPS] "r"(kBPS) 8601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", 8611e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora "q10", "q11", "q12", "q13", "q14", "q15" // clobbered 8621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora ) ; 8631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 8641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora return sum; 8651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 8661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 86733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif // USE_INTRINSICS 86833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 8691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b, 8701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const uint16_t* const w) { 
8711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int D = 0; 8721e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora int x, y; 8731e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora for (y = 0; y < 16 * BPS; y += 4 * BPS) { 8741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora for (x = 0; x < 16; x += 4) { 8751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora D += Disto4x4(a + x + y, b + x + y, w); 8761e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 8771e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 8781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora return D; 8791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 8801e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 88133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 88233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 88333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 88433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int start_block, int end_block, 88533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8Histogram* const histo) { 88633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH); 88733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int j; 88833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (j = start_block; j < end_block; ++j) { 88933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int16_t out[16]; 89033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 89133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora { 89233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int k; 89333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a0 = vld1q_s16(out + 0); 89433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t b0 = vld1q_s16(out + 8); 
89533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0)); 89633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0)); 89733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t a2 = vshrq_n_u16(a1, 3); 89833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t b2 = vshrq_n_u16(b1, 3); 89933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh); 90033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh); 90133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1q_s16(out + 0, vreinterpretq_s16_u16(a3)); 90233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1q_s16(out + 8, vreinterpretq_s16_u16(b3)); 90333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Convert coefficients to bin. 90433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (k = 0; k < 16; ++k) { 90533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora histo->distribution[out[k]]++; 90633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 90733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 90833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 90933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 91033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 91133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 91233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 91333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE void AccumulateSSE16(const uint8_t* const a, 91433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8_t* const b, 91533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint32x4_t* const sum) { 91633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t a0 = vld1q_u8(a); 91733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 
const uint8x16_t b0 = vld1q_u8(b); 91833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t abs_diff = vabdq_u8(a0, b0); 91933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 92033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 92133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate 92233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 92333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 92433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora// Horizontal sum of all four uint32_t values in 'sum'. 92533f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SumToInt(uint32x4_t sum) { 92633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint64x2_t sum2 = vpaddlq_u32(sum); 92733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1); 92833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return (int)sum3; 92933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 93033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 93133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) { 9328c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint32x4_t sum = vdupq_n_u32(0); 93333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int y; 93433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (y = 0; y < 16; ++y) { 93533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 93633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 93733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return SumToInt(sum); 93833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 93933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 94033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int 
SSE16x8(const uint8_t* a, const uint8_t* b) { 9418c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint32x4_t sum = vdupq_n_u32(0); 94233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int y; 94333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (y = 0; y < 8; ++y) { 94433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora AccumulateSSE16(a + y * BPS, b + y * BPS, &sum); 94533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 94633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return SumToInt(sum); 94733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 94833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 94933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) { 9508c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint32x4_t sum = vdupq_n_u32(0); 95133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int y; 95233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora for (y = 0; y < 8; ++y) { 95333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t a0 = vld1_u8(a + y * BPS); 95433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t b0 = vld1_u8(b + y * BPS); 95533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x8_t abs_diff = vabd_u8(a0, b0); 95633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t prod = vmull_u8(abs_diff, abs_diff); 95733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora sum = vpadalq_u16(sum, prod); 95833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 95933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return SumToInt(sum); 96033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 96133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 96233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) { 96333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t a0 = Load4x4(a); 96433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t b0 = Load4x4(b); 
96533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint8x16_t abs_diff = vabdq_u8(a0, b0); 96633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff)); 96733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff)); 96833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return SumToInt(vpaddlq_u16(prod)); 96933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 97033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 97133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora//------------------------------------------------------------------------------ 97233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 9738c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora// Compilation with gcc-4.6.x is problematic for now. 9748c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if !defined(WORK_AROUND_GCC) 97533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 97633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int16x8_t Quantize(int16_t* const in, 97733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const VP8Matrix* const mtx, int offset) { 97833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]); 97933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t q = vld1q_u16(&mtx->q_[offset]); 98033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]); 98133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]); 98233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]); 98333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 98433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t a = vld1q_s16(in + offset); // in 98533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t b = 
vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in) 98633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t sign = vshrq_n_s16(a, 15); // sign 98733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen 98833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq)); 98933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq)); 99033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t m2 = vhaddq_u32(m0, bias0); 99133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1 99233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16), 99333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1 99433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL)); 99533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign); 99633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign 99733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q)); 99833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vst1q_s16(in + offset, c4); 99933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora assert(QFIX == 17); // this function can't work as is if QFIX != 16+1 100033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return c3; 100133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 100233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 100333f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic const uint8_t kShuffles[4][8] = { 10048c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora { 0, 1, 2, 3, 8, 9, 16, 17 }, 
10058c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora { 10, 11, 4, 5, 6, 7, 12, 13 }, 10068c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora { 18, 19, 24, 25, 26, 27, 20, 21 }, 10078c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora { 14, 15, 22, 23, 28, 29, 30, 31 } 100833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}; 100933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 101033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16], 101133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const VP8Matrix* const mtx) { 101233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t out0 = Quantize(in, mtx, 0); 101333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int16x8_t out1 = Quantize(in, mtx, 8); 10148c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint8x8x4_t shuffles; 10159e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use 10169e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern // non-standard versions there. 
10179e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern#if defined(__APPLE__) && defined(__aarch64__) && \ 10189e80ee991168a0a6c2a906dd2c17c5e17df4566eJames Zern defined(__apple_build_version__) && (__apple_build_version__< 6020037) 10198c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora uint8x16x2_t all_out; 10208c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1)); 10218c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora INIT_VECTOR4(shuffles, 10228c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl2q_u8(all_out, vld1_u8(kShuffles[0])), 10238c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl2q_u8(all_out, vld1_u8(kShuffles[1])), 10248c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl2q_u8(all_out, vld1_u8(kShuffles[2])), 10258c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl2q_u8(all_out, vld1_u8(kShuffles[3]))); 10268c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#else 102733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora uint8x8x4_t all_out; 102833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora INIT_VECTOR4(all_out, 102933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_u8_s16(vget_low_s16(out0)), 103033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_u8_s16(vget_high_s16(out0)), 103133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_u8_s16(vget_low_s16(out1)), 103233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora vreinterpret_u8_s16(vget_high_s16(out1))); 10338c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora INIT_VECTOR4(shuffles, 10348c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl4_u8(all_out, vld1_u8(kShuffles[0])), 10358c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl4_u8(all_out, vld1_u8(kShuffles[1])), 10368c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl4_u8(all_out, vld1_u8(kShuffles[2])), 10378c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vtbl4_u8(all_out, vld1_u8(kShuffles[3]))); 
10388c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#endif 103933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // Zigzag reordering 10408c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vst1_u8((uint8_t*)(out + 0), shuffles.val[0]); 10418c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vst1_u8((uint8_t*)(out + 4), shuffles.val[1]); 10428c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vst1_u8((uint8_t*)(out + 8), shuffles.val[2]); 10438c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora vst1_u8((uint8_t*)(out + 12), shuffles.val[3]); 104433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // test zeros 104533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (*(uint64_t*)(out + 0) != 0) return 1; 104633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (*(uint64_t*)(out + 4) != 0) return 1; 104733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (*(uint64_t*)(out + 8) != 0) return 1; 104833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (*(uint64_t*)(out + 12) != 0) return 1; 104933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return 0; 105033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 105133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 10528c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#endif // !WORK_AROUND_GCC 105333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 10541e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif // WEBP_USE_NEON 10551e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10561e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 10571e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point 10581e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10591e7bf8805bd030c19924a5306837ecd72c295751Vikas Aroraextern void VP8EncDspInitNEON(void); 10601e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10611e7bf8805bd030c19924a5306837ecd72c295751Vikas Aroravoid VP8EncDspInitNEON(void) { 
10621e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#if defined(WEBP_USE_NEON) 10631e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8ITransform = ITransform; 10641e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8FTransform = FTransform; 10651e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10661e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8FTransformWHT = FTransformWHT; 10671e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 10681e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8TDisto4x4 = Disto4x4; 10691e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora VP8TDisto16x16 = Disto16x16; 107033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8CollectHistogram = CollectHistogram; 107133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE16x16 = SSE16x16; 107233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE16x8 = SSE16x8; 107333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE8x8 = SSE8x8; 107433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE4x4 = SSE4x4; 10758c098653157979e397d3954fc2ea0ee43bae6ab2Vikas Arora#if !defined(WORK_AROUND_GCC) 107633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8EncQuantizeBlock = QuantizeBlock; 107733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora#endif 10781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora#endif // WEBP_USE_NEON 10791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 1080