// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical encoding functions.
//
// Author: Christian Duvivier (cduvivier@google.com)

#include "./dsp.h"

#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <stdlib.h>  // for abs()
#include <emmintrin.h>

#include "./common_sse2.h"
#include "../enc/cost_enc.h"
#include "../enc/vp8i_enc.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

// Does one or two inverse transforms.
2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, 3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora int do_two) { 31466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // This implementation makes use of 16-bit fixed point versions of two 32466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // multiply constants: 33466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 34466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 35466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // 36466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // To be able to use signed 16-bit integers, we use the following trick to 37466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // have constants within range: 38466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // - Associated constants are obtained by subtracting the 16-bit fixed point 39466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // version of one: 40466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // k = K - (1 << 16) => K = k + (1 << 16) 41466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // K1 = 85267 => k1 = 20091 42466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // K2 = 35468 => k2 = -30068 43466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // - The multiplication of a variable by a constant become the sum of the 44466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // variable and the multiplication of that variable by the associated 45466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // constant: 46466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x 47466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i k1 = _mm_set1_epi16(20091); 48466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i k2 = _mm_set1_epi16(-30068); 
49466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i T0, T1, T2, T3; 50466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 51466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load and concatenate the transform coefficients (we'll do two inverse 52466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // transforms in parallel). In the case of only one inverse transform, the 53466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // second half of the vectors will just contain random value we'll never 54466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // use nor store. 55466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i in0, in1, in2, in3; 56466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 577c8da7ce66017295a65ec028084b90800be377f8James Zern in0 = _mm_loadl_epi64((const __m128i*)&in[0]); 587c8da7ce66017295a65ec028084b90800be377f8James Zern in1 = _mm_loadl_epi64((const __m128i*)&in[4]); 597c8da7ce66017295a65ec028084b90800be377f8James Zern in2 = _mm_loadl_epi64((const __m128i*)&in[8]); 607c8da7ce66017295a65ec028084b90800be377f8James Zern in3 = _mm_loadl_epi64((const __m128i*)&in[12]); 61466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a00 a10 a20 a30 x x x x 62466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a01 a11 a21 a31 x x x x 63466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a02 a12 a22 a32 x x x x 64466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a03 a13 a23 a33 x x x x 65466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora if (do_two) { 667c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]); 677c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]); 687c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]); 697c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]); 
70466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in0 = _mm_unpacklo_epi64(in0, inB0); 71466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in1 = _mm_unpacklo_epi64(in1, inB1); 72466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in2 = _mm_unpacklo_epi64(in2, inB2); 73466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in3 = _mm_unpacklo_epi64(in3, inB3); 74466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a00 a10 a20 a30 b00 b10 b20 b30 75466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a01 a11 a21 a31 b01 b11 b21 b31 76466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a02 a12 a22 a32 b02 b12 b22 b32 77466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a03 a13 a23 a33 b03 b13 b23 b33 78466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 79466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 80466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 81466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Vertical pass and subsequent transpose. 82466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 83466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // First pass, c and d calculations are longer because of the "trick" 84466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // multiplications. 
85466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a = _mm_add_epi16(in0, in2); 86466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b = _mm_sub_epi16(in0, in2); 87466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 88466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c1 = _mm_mulhi_epi16(in1, k2); 89466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c2 = _mm_mulhi_epi16(in3, k1); 90466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c3 = _mm_sub_epi16(in1, in3); 91466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c4 = _mm_sub_epi16(c1, c2); 92466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c = _mm_add_epi16(c3, c4); 93466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 94466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d1 = _mm_mulhi_epi16(in1, k1); 95466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d2 = _mm_mulhi_epi16(in3, k2); 96466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d3 = _mm_add_epi16(in1, in3); 97466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d4 = _mm_add_epi16(d1, d2); 98466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d = _mm_add_epi16(d3, d4); 99466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 100466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Second pass. 
101466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp0 = _mm_add_epi16(a, d); 102466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp1 = _mm_add_epi16(b, c); 103466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp2 = _mm_sub_epi16(b, c); 104466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp3 = _mm_sub_epi16(a, d); 105466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 106466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Transpose the two 4x4. 1070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); 108466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 109466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 110466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Horizontal pass and subsequent transpose. 111466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 112466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // First pass, c and d calculations are longer because of the "trick" 113466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // multiplications. 
114466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i four = _mm_set1_epi16(4); 115466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i dc = _mm_add_epi16(T0, four); 116466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a = _mm_add_epi16(dc, T2); 117466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b = _mm_sub_epi16(dc, T2); 118466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 119466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c1 = _mm_mulhi_epi16(T1, k2); 120466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c2 = _mm_mulhi_epi16(T3, k1); 121466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c3 = _mm_sub_epi16(T1, T3); 122466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c4 = _mm_sub_epi16(c1, c2); 123466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i c = _mm_add_epi16(c3, c4); 124466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 125466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d1 = _mm_mulhi_epi16(T1, k1); 126466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d2 = _mm_mulhi_epi16(T3, k2); 127466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d3 = _mm_add_epi16(T1, T3); 128466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d4 = _mm_add_epi16(d1, d2); 129466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i d = _mm_add_epi16(d3, d4); 130466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 131466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Second pass. 
132466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp0 = _mm_add_epi16(a, d); 133466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp1 = _mm_add_epi16(b, c); 134466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp2 = _mm_sub_epi16(b, c); 135466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i tmp3 = _mm_sub_epi16(a, d); 136466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); 137466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); 138466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); 139466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); 140466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 141466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Transpose the two 4x4. 1420912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, 143fa39824bb690c5806358871f46940d0450973d8aJames Zern &T2, &T3); 144466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 145466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 146466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Add inverse transform to 'ref' and store. 147466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i zero = _mm_setzero_si128(); 149466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load the reference(s). 150466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i ref0, ref1, ref2, ref3; 151466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora if (do_two) { 152466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load eight bytes/pixels per line. 
1537c8da7ce66017295a65ec028084b90800be377f8James Zern ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 1547c8da7ce66017295a65ec028084b90800be377f8James Zern ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 1557c8da7ce66017295a65ec028084b90800be377f8James Zern ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 1567c8da7ce66017295a65ec028084b90800be377f8James Zern ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 157466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } else { 158466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load four bytes/pixels per line. 1597c8da7ce66017295a65ec028084b90800be377f8James Zern ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS])); 1607c8da7ce66017295a65ec028084b90800be377f8James Zern ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS])); 1617c8da7ce66017295a65ec028084b90800be377f8James Zern ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS])); 1627c8da7ce66017295a65ec028084b90800be377f8James Zern ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS])); 163466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 164466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Convert to 16b. 165466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref0 = _mm_unpacklo_epi8(ref0, zero); 166466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref1 = _mm_unpacklo_epi8(ref1, zero); 167466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref2 = _mm_unpacklo_epi8(ref2, zero); 168466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref3 = _mm_unpacklo_epi8(ref3, zero); 169466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Add the inverse transform(s). 
170466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref0 = _mm_add_epi16(ref0, T0); 171466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref1 = _mm_add_epi16(ref1, T1); 172466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref2 = _mm_add_epi16(ref2, T2); 173466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref3 = _mm_add_epi16(ref3, T3); 174466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Unsigned saturate to 8b. 175466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref0 = _mm_packus_epi16(ref0, ref0); 176466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref1 = _mm_packus_epi16(ref1, ref1); 177466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref2 = _mm_packus_epi16(ref2, ref2); 178466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora ref3 = _mm_packus_epi16(ref3, ref3); 179466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Store the results. 180466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora if (do_two) { 181466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Store eight bytes/pixels per line. 182466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); 183466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); 184466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); 185466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); 186466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } else { 187466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Store four bytes/pixels per line. 
1887c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0)); 1897c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1)); 1907c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2)); 1917c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3)); 192466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 193466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 194466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 195466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1967c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransformPass1(const __m128i* const in01, 1977c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i* const in23, 1987c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i* const out01, 1997c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i* const out32) { 2001e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k937 = _mm_set1_epi32(937); 2011e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k1812 = _mm_set1_epi32(1812); 2027c8da7ce66017295a65ec028084b90800be377f8James Zern 2031e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); 2041e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); 2051e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2061e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 2217, 5352, 2217, 5352); 2071e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, 2081e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora -5352, 2217, -5352, 2217); 2097c8da7ce66017295a65ec028084b90800be377f8James Zern 
2107c8da7ce66017295a65ec028084b90800be377f8James Zern // *in01 = 00 01 10 11 02 03 12 13 2117c8da7ce66017295a65ec028084b90800be377f8James Zern // *in23 = 20 21 30 31 22 23 32 33 2127c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1)); 2137c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1)); 2147c8da7ce66017295a65ec028084b90800be377f8James Zern // 00 01 10 11 03 02 13 12 2157c8da7ce66017295a65ec028084b90800be377f8James Zern // 20 21 30 31 23 22 33 32 2167c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); 2177c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); 2187c8da7ce66017295a65ec028084b90800be377f8James Zern // 00 01 10 11 20 21 30 31 2197c8da7ce66017295a65ec028084b90800be377f8James Zern // 03 02 13 12 23 22 33 32 2207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a01 = _mm_add_epi16(s01, s32); 2217c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a32 = _mm_sub_epi16(s01, s32); 2227c8da7ce66017295a65ec028084b90800be377f8James Zern // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] 2237c8da7ce66017295a65ec028084b90800be377f8James Zern // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] 2247c8da7ce66017295a65ec028084b90800be377f8James Zern 2257c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] 2267c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... 
] 2277c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); 2287c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); 2297c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); 2307c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); 2317c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); 2327c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); 2337c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); 2347c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); 2357c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... 2367c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 2377c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); 2387c8da7ce66017295a65ec028084b90800be377f8James Zern *out01 = _mm_unpacklo_epi32(s_lo, s_hi); 2397c8da7ce66017295a65ec028084b90800be377f8James Zern *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. 
2407c8da7ce66017295a65ec028084b90800be377f8James Zern} 2417c8da7ce66017295a65ec028084b90800be377f8James Zern 2427c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransformPass2(const __m128i* const v01, const __m128i* const v32, 2437c8da7ce66017295a65ec028084b90800be377f8James Zern int16_t* out) { 2447c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 2457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i seven = _mm_set1_epi16(7); 2467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 2477c8da7ce66017295a65ec028084b90800be377f8James Zern 5352, 2217, 5352, 2217); 2487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2497c8da7ce66017295a65ec028084b90800be377f8James Zern 2217, -5352, 2217, -5352); 2507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); 2517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i k51000 = _mm_set1_epi32(51000); 2527c8da7ce66017295a65ec028084b90800be377f8James Zern 2537c8da7ce66017295a65ec028084b90800be377f8James Zern // Same operations are done on the (0,3) and (1,2) pairs. 
2547c8da7ce66017295a65ec028084b90800be377f8James Zern // a3 = v0 - v3 2557c8da7ce66017295a65ec028084b90800be377f8James Zern // a2 = v1 - v2 2567c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a32 = _mm_sub_epi16(*v01, *v32); 2577c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a22 = _mm_unpackhi_epi64(a32, a32); 2587c8da7ce66017295a65ec028084b90800be377f8James Zern 2597c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b23 = _mm_unpacklo_epi16(a22, a32); 2607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); 2617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); 2627c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); 2637c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d3 = _mm_add_epi32(c3, k51000); 2647c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i e1 = _mm_srai_epi32(d1, 16); 2657c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i e3 = _mm_srai_epi32(d3, 16); 266fa39824bb690c5806358871f46940d0450973d8aJames Zern // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) 267fa39824bb690c5806358871f46940d0450973d8aJames Zern // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) 2687c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i f1 = _mm_packs_epi32(e1, e1); 2697c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i f3 = _mm_packs_epi32(e3, e3); 270fa39824bb690c5806358871f46940d0450973d8aJames Zern // g1 = f1 + (a3 != 0); 2717c8da7ce66017295a65ec028084b90800be377f8James Zern // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the 2727c8da7ce66017295a65ec028084b90800be377f8James Zern // desired (0, 1), we add one earlier through k12000_plus_one. 
273fa39824bb690c5806358871f46940d0450973d8aJames Zern // -> g1 = f1 + 1 - (a3 == 0) 2747c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); 2757c8da7ce66017295a65ec028084b90800be377f8James Zern 276fa39824bb690c5806358871f46940d0450973d8aJames Zern // a0 = v0 + v3 277fa39824bb690c5806358871f46940d0450973d8aJames Zern // a1 = v1 + v2 278fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a01 = _mm_add_epi16(*v01, *v32); 279fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); 280fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a11 = _mm_unpackhi_epi64(a01, a01); 281fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); 282fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); 283fa39824bb690c5806358871f46940d0450973d8aJames Zern // d0 = (a0 + a1 + 7) >> 4; 284fa39824bb690c5806358871f46940d0450973d8aJames Zern // d2 = (a0 - a1 + 7) >> 4; 285fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d0 = _mm_srai_epi16(c0, 4); 286fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d2 = _mm_srai_epi16(c2, 4); 287fa39824bb690c5806358871f46940d0450973d8aJames Zern 2887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); 2897c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); 2907c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&out[0], d0_g1); 2917c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&out[8], d2_f3); 2927c8da7ce66017295a65ec028084b90800be377f8James Zern} 2937c8da7ce66017295a65ec028084b90800be377f8James Zern 2947c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { 
2957c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 2960912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Load src. 2977c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 2987c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); 2997c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); 3007c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); 3010912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 00 01 02 03 * 3020912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 10 11 12 13 * 3030912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 20 21 22 23 * 3040912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 30 31 32 33 * 3050912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Shuffle. 3060912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i src_0 = _mm_unpacklo_epi16(src0, src1); 3070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i src_1 = _mm_unpacklo_epi16(src2, src3); 3080912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 00 01 10 11 02 03 12 13 * * ... 3090912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // 20 21 30 31 22 22 32 33 * * ... 3100912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern 3110912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Load ref. 
3127c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 3137c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 3147c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 3157c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 3160912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1); 3170912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3); 3180912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern 3190912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Convert both to 16 bit. 3200912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero); 3210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero); 3220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero); 3230912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero); 3240912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern 3250912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Compute the difference. 
3260912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b); 3270912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b); 328b6dbce6bfeaabde2a7b581c4c6888d532d32f3acDerek Sollenberger __m128i v01, v32; 3294b2196c929b70f2cdc1c2556580d349db89356d8Vikas Arora 3307c8da7ce66017295a65ec028084b90800be377f8James Zern // First pass 3310912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern FTransformPass1(&row01, &row23, &v01, &v32); 3321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 3337c8da7ce66017295a65ec028084b90800be377f8James Zern // Second pass 3347c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformPass2(&v01, &v32, out); 3357c8da7ce66017295a65ec028084b90800be377f8James Zern} 3367c8da7ce66017295a65ec028084b90800be377f8James Zern 3377c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { 3387c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 3397c8da7ce66017295a65ec028084b90800be377f8James Zern 3407c8da7ce66017295a65ec028084b90800be377f8James Zern // Load src and convert to 16b. 
3417c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); 3427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); 3437c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); 3447c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); 3457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); 3467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); 3477c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); 3487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); 3497c8da7ce66017295a65ec028084b90800be377f8James Zern // Load ref and convert to 16b. 
3507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); 3517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); 3527c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); 3537c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); 3547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); 3557c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); 3567c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); 3577c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); 3587c8da7ce66017295a65ec028084b90800be377f8James Zern // Compute difference. -> 00 01 02 03 00' 01' 02' 03' 3597c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); 3607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); 3617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); 3627c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); 3637c8da7ce66017295a65ec028084b90800be377f8James Zern 3647c8da7ce66017295a65ec028084b90800be377f8James Zern // Unpack and shuffle 3657c8da7ce66017295a65ec028084b90800be377f8James Zern // 00 01 02 03 0 0 0 0 3667c8da7ce66017295a65ec028084b90800be377f8James Zern // 10 11 12 13 0 0 0 0 3677c8da7ce66017295a65ec028084b90800be377f8James Zern // 20 21 22 23 0 0 0 0 3687c8da7ce66017295a65ec028084b90800be377f8James Zern // 30 31 32 33 0 0 0 0 3697c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1); 
3707c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3); 3717c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1); 3727c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3); 3737c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i v01l, v32l; 3747c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i v01h, v32h; 3757c8da7ce66017295a65ec028084b90800be377f8James Zern 3767c8da7ce66017295a65ec028084b90800be377f8James Zern // First pass 3777c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l); 3787c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h); 379466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 380466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Second pass 3817c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformPass2(&v01l, &v32l, out + 0); 3827c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformPass2(&v01h, &v32h, out + 16); 3837c8da7ce66017295a65ec028084b90800be377f8James Zern} 3847c8da7ce66017295a65ec028084b90800be377f8James Zern 3857c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransformWHTRow(const int16_t* const in, __m128i* const out) { 3860912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); 3877c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); 3887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); 3897c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); 3907c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]); 3917c8da7ce66017295a65ec028084b90800be377f8James Zern 
const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ... 3927c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... 3937c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... 3947c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... 3950912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ... 3960912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ... 3970912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1 3980912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern *out = _mm_madd_epi16(D, kMult); 399466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 400466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 40133f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void FTransformWHT(const int16_t* in, int16_t* out) { 4020912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Input is 12b signed. 4037c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i row0, row1, row2, row3; 4040912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Rows are 14b signed. 4057c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformWHTRow(in + 0 * 64, &row0); 4067c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformWHTRow(in + 1 * 64, &row1); 4077c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformWHTRow(in + 2 * 64, &row2); 4087c8da7ce66017295a65ec028084b90800be377f8James Zern FTransformWHTRow(in + 3 * 64, &row3); 4097c8da7ce66017295a65ec028084b90800be377f8James Zern 4100406ce1417f76f2034833414dcecc9f56253640cVikas Arora { 4110912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // The a* are 15b signed. 
4127c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = _mm_add_epi32(row0, row2); 4137c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = _mm_add_epi32(row1, row3); 4147c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2 = _mm_sub_epi32(row1, row3); 4157c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a3 = _mm_sub_epi32(row0, row2); 4160912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i a0a3 = _mm_packs_epi32(a0, a3); 4170912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i a1a2 = _mm_packs_epi32(a1, a2); 4180912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern 4190912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // The b* are 16b signed. 4200912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2); 4210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2); 4220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2); 4230912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2); 4240912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern 4250912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1)); 4260912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1)); 4270406ce1417f76f2034833414dcecc9f56253640cVikas Arora } 4280406ce1417f76f2034833414dcecc9f56253640cVikas Arora} 4290406ce1417f76f2034833414dcecc9f56253640cVikas Arora 430a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------ 4317c8da7ce66017295a65ec028084b90800be377f8James Zern// Compute susceptibility based on DCT-coeff histograms: 4327c8da7ce66017295a65ec028084b90800be377f8James Zern// the higher, the "easier" the macroblock is to compress. 
433466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 4347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred, 4357c8da7ce66017295a65ec028084b90800be377f8James Zern int start_block, int end_block, 4367c8da7ce66017295a65ec028084b90800be377f8James Zern VP8Histogram* const histo) { 4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i zero = _mm_setzero_si128(); 4387c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); 4397c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 4407c8da7ce66017295a65ec028084b90800be377f8James Zern int distribution[MAX_COEFF_THRESH + 1] = { 0 }; 4417c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = start_block; j < end_block; ++j) { 4427c8da7ce66017295a65ec028084b90800be377f8James Zern int16_t out[16]; 4437c8da7ce66017295a65ec028084b90800be377f8James Zern int k; 4447c8da7ce66017295a65ec028084b90800be377f8James Zern 4457c8da7ce66017295a65ec028084b90800be377f8James Zern FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); 4467c8da7ce66017295a65ec028084b90800be377f8James Zern 4477c8da7ce66017295a65ec028084b90800be377f8James Zern // Convert coefficients to bin (within out[]). 4487c8da7ce66017295a65ec028084b90800be377f8James Zern { 4497c8da7ce66017295a65ec028084b90800be377f8James Zern // Load. 
4507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]); 4517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]); 4527c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d0 = _mm_sub_epi16(zero, out0); 4537c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d1 = _mm_sub_epi16(zero, out1); 4547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b 4557c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abs1 = _mm_max_epi16(out1, d1); 4567c8da7ce66017295a65ec028084b90800be377f8James Zern // v = abs(out) >> 3 4577c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i v0 = _mm_srai_epi16(abs0, 3); 4587c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i v1 = _mm_srai_epi16(abs1, 3); 4597c8da7ce66017295a65ec028084b90800be377f8James Zern // bin = min(v, MAX_COEFF_THRESH) 4607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh); 4617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh); 4627c8da7ce66017295a65ec028084b90800be377f8James Zern // Store. 4637c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&out[0], bin0); 4647c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)&out[8], bin1); 4657c8da7ce66017295a65ec028084b90800be377f8James Zern } 4667c8da7ce66017295a65ec028084b90800be377f8James Zern 4677c8da7ce66017295a65ec028084b90800be377f8James Zern // Convert coefficients to bin. 
4687c8da7ce66017295a65ec028084b90800be377f8James Zern for (k = 0; k < 16; ++k) { 4697c8da7ce66017295a65ec028084b90800be377f8James Zern ++distribution[out[k]]; 4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 4727c8da7ce66017295a65ec028084b90800be377f8James Zern VP8SetHistogramData(distribution, histo); 4737c8da7ce66017295a65ec028084b90800be377f8James Zern} 4747c8da7ce66017295a65ec028084b90800be377f8James Zern 4757c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 4767c8da7ce66017295a65ec028084b90800be377f8James Zern// Intra predictions 4777c8da7ce66017295a65ec028084b90800be377f8James Zern 4787c8da7ce66017295a65ec028084b90800be377f8James Zern// helper for chroma-DC predictions 4797c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) { 4807c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 4817c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i values = _mm_set1_epi8(v); 4827c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 8; ++j) { 4837c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storel_epi64((__m128i*)(dst + j * BPS), values); 4847c8da7ce66017295a65ec028084b90800be377f8James Zern } 4857c8da7ce66017295a65ec028084b90800be377f8James Zern} 4867c8da7ce66017295a65ec028084b90800be377f8James Zern 4877c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) { 4887c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 4897c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i values = _mm_set1_epi8(v); 4907c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 16; ++j) { 4917c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_store_si128((__m128i*)(dst + j * BPS), values); 4927c8da7ce66017295a65ec028084b90800be377f8James Zern } 
4937c8da7ce66017295a65ec028084b90800be377f8James Zern} 4947c8da7ce66017295a65ec028084b90800be377f8James Zern 4957c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Fill(uint8_t* dst, int value, int size) { 4967c8da7ce66017295a65ec028084b90800be377f8James Zern if (size == 4) { 4977c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 4987c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 4; ++j) { 4997c8da7ce66017295a65ec028084b90800be377f8James Zern memset(dst + j * BPS, value, 4); 5007c8da7ce66017295a65ec028084b90800be377f8James Zern } 5017c8da7ce66017295a65ec028084b90800be377f8James Zern } else if (size == 8) { 5027c8da7ce66017295a65ec028084b90800be377f8James Zern Put8x8uv(value, dst); 5037c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5047c8da7ce66017295a65ec028084b90800be377f8James Zern Put16(value, dst); 5057c8da7ce66017295a65ec028084b90800be377f8James Zern } 5067c8da7ce66017295a65ec028084b90800be377f8James Zern} 5077c8da7ce66017295a65ec028084b90800be377f8James Zern 5087c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) { 5097c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 5107c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 5117c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 8; ++j) { 5127c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values); 5137c8da7ce66017295a65ec028084b90800be377f8James Zern } 5147c8da7ce66017295a65ec028084b90800be377f8James Zern} 5157c8da7ce66017295a65ec028084b90800be377f8James Zern 5167c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) { 5177c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_load_si128((const __m128i*)top); 5187c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 
5197c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 16; ++j) { 5207c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_store_si128((__m128i*)(dst + j * BPS), top_values); 5217c8da7ce66017295a65ec028084b90800be377f8James Zern } 5227c8da7ce66017295a65ec028084b90800be377f8James Zern} 5237c8da7ce66017295a65ec028084b90800be377f8James Zern 5247c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VerticalPred(uint8_t* dst, 5257c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top, int size) { 5267c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) { 5277c8da7ce66017295a65ec028084b90800be377f8James Zern if (size == 8) { 5287c8da7ce66017295a65ec028084b90800be377f8James Zern VE8uv(dst, top); 5297c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5307c8da7ce66017295a65ec028084b90800be377f8James Zern VE16(dst, top); 5317c8da7ce66017295a65ec028084b90800be377f8James Zern } 5327c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5337c8da7ce66017295a65ec028084b90800be377f8James Zern Fill(dst, 127, size); 5347c8da7ce66017295a65ec028084b90800be377f8James Zern } 5357c8da7ce66017295a65ec028084b90800be377f8James Zern} 5367c8da7ce66017295a65ec028084b90800be377f8James Zern 5377c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) { 5387c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 5397c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 8; ++j) { 5407c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i values = _mm_set1_epi8(left[j]); 5417c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storel_epi64((__m128i*)dst, values); 5427c8da7ce66017295a65ec028084b90800be377f8James Zern dst += BPS; 5437c8da7ce66017295a65ec028084b90800be377f8James Zern } 5447c8da7ce66017295a65ec028084b90800be377f8James Zern} 5457c8da7ce66017295a65ec028084b90800be377f8James Zern 5467c8da7ce66017295a65ec028084b90800be377f8James 
Zernstatic WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) { 5477c8da7ce66017295a65ec028084b90800be377f8James Zern int j; 5487c8da7ce66017295a65ec028084b90800be377f8James Zern for (j = 0; j < 16; ++j) { 5497c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i values = _mm_set1_epi8(left[j]); 5507c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_store_si128((__m128i*)dst, values); 5517c8da7ce66017295a65ec028084b90800be377f8James Zern dst += BPS; 5527c8da7ce66017295a65ec028084b90800be377f8James Zern } 5537c8da7ce66017295a65ec028084b90800be377f8James Zern} 5547c8da7ce66017295a65ec028084b90800be377f8James Zern 5557c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HorizontalPred(uint8_t* dst, 5567c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* left, int size) { 5577c8da7ce66017295a65ec028084b90800be377f8James Zern if (left != NULL) { 5587c8da7ce66017295a65ec028084b90800be377f8James Zern if (size == 8) { 5597c8da7ce66017295a65ec028084b90800be377f8James Zern HE8uv(dst, left); 5607c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5617c8da7ce66017295a65ec028084b90800be377f8James Zern HE16(dst, left); 5627c8da7ce66017295a65ec028084b90800be377f8James Zern } 5637c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5647c8da7ce66017295a65ec028084b90800be377f8James Zern Fill(dst, 129, size); 5657c8da7ce66017295a65ec028084b90800be377f8James Zern } 5667c8da7ce66017295a65ec028084b90800be377f8James Zern} 5677c8da7ce66017295a65ec028084b90800be377f8James Zern 5687c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left, 5697c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top, int size) { 5707c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 5717c8da7ce66017295a65ec028084b90800be377f8James Zern int y; 5727c8da7ce66017295a65ec028084b90800be377f8James Zern if (size == 8) { 
5737c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 5747c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); 5757c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < 8; ++y, dst += BPS) { 5767c8da7ce66017295a65ec028084b90800be377f8James Zern const int val = left[y] - left[-1]; 5777c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i base = _mm_set1_epi16(val); 5787c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); 5797c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storel_epi64((__m128i*)dst, out); 5807c8da7ce66017295a65ec028084b90800be377f8James Zern } 5817c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 5827c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_load_si128((const __m128i*)top); 5837c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero); 5847c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero); 5857c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < 16; ++y, dst += BPS) { 5867c8da7ce66017295a65ec028084b90800be377f8James Zern const int val = left[y] - left[-1]; 5877c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i base = _mm_set1_epi16(val); 5887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out_0 = _mm_add_epi16(base, top_base_0); 5897c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out_1 = _mm_add_epi16(base, top_base_1); 5907c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_packus_epi16(out_0, out_1); 5917c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_store_si128((__m128i*)dst, out); 5927c8da7ce66017295a65ec028084b90800be377f8James Zern } 
5937c8da7ce66017295a65ec028084b90800be377f8James Zern } 5947c8da7ce66017295a65ec028084b90800be377f8James Zern} 5957c8da7ce66017295a65ec028084b90800be377f8James Zern 5967c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, 5977c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top, int size) { 5987c8da7ce66017295a65ec028084b90800be377f8James Zern if (left != NULL) { 5997c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) { 6007c8da7ce66017295a65ec028084b90800be377f8James Zern TM(dst, left, top, size); 6017c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 6027c8da7ce66017295a65ec028084b90800be377f8James Zern HorizontalPred(dst, left, size); 6037c8da7ce66017295a65ec028084b90800be377f8James Zern } 6047c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 6057c8da7ce66017295a65ec028084b90800be377f8James Zern // true motion without left samples (hence: with default 129 value) 6067c8da7ce66017295a65ec028084b90800be377f8James Zern // is equivalent to VE prediction where you just copy the top samples. 6077c8da7ce66017295a65ec028084b90800be377f8James Zern // Note that if top samples are not available, the default value is 6087c8da7ce66017295a65ec028084b90800be377f8James Zern // then 129, and not 127 as in the VerticalPred case. 
6097c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) { 6107c8da7ce66017295a65ec028084b90800be377f8James Zern VerticalPred(dst, top, size); 6117c8da7ce66017295a65ec028084b90800be377f8James Zern } else { 6127c8da7ce66017295a65ec028084b90800be377f8James Zern Fill(dst, 129, size); 6137c8da7ce66017295a65ec028084b90800be377f8James Zern } 6147c8da7ce66017295a65ec028084b90800be377f8James Zern } 6157c8da7ce66017295a65ec028084b90800be377f8James Zern} 6167c8da7ce66017295a65ec028084b90800be377f8James Zern 6177c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, 6187c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { 6197c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 6207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); 6210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); 6220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const int DC = VP8HorizontalAdd8b(&combined) + 8; 6237c8da7ce66017295a65ec028084b90800be377f8James Zern Put8x8uv(DC >> 4, dst); 6247c8da7ce66017295a65ec028084b90800be377f8James Zern} 6257c8da7ce66017295a65ec028084b90800be377f8James Zern 6267c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) { 6277c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 6287c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); 6297c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sum = _mm_sad_epu8(top_values, zero); 6307c8da7ce66017295a65ec028084b90800be377f8James Zern const int DC = _mm_cvtsi128_si32(sum) + 4; 6317c8da7ce66017295a65ec028084b90800be377f8James Zern Put8x8uv(DC >> 3, dst); 
6327c8da7ce66017295a65ec028084b90800be377f8James Zern} 6337c8da7ce66017295a65ec028084b90800be377f8James Zern 6347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) { 6357c8da7ce66017295a65ec028084b90800be377f8James Zern // 'left' is contiguous so we can reuse the top summation. 6367c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvNoLeft(dst, left); 6377c8da7ce66017295a65ec028084b90800be377f8James Zern} 6387c8da7ce66017295a65ec028084b90800be377f8James Zern 6397c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) { 6407c8da7ce66017295a65ec028084b90800be377f8James Zern Put8x8uv(0x80, dst); 6417c8da7ce66017295a65ec028084b90800be377f8James Zern} 6427c8da7ce66017295a65ec028084b90800be377f8James Zern 6437c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left, 6447c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { 6457c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) { 6467c8da7ce66017295a65ec028084b90800be377f8James Zern if (left != NULL) { // top and left present 6477c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uv(dst, left, top); 6487c8da7ce66017295a65ec028084b90800be377f8James Zern } else { // top, but no left 6497c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvNoLeft(dst, top); 6507c8da7ce66017295a65ec028084b90800be377f8James Zern } 6517c8da7ce66017295a65ec028084b90800be377f8James Zern } else if (left != NULL) { // left but no top 6527c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvNoTop(dst, left); 6537c8da7ce66017295a65ec028084b90800be377f8James Zern } else { // no top, no left, nothing. 
6547c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvNoTopLeft(dst); 6557c8da7ce66017295a65ec028084b90800be377f8James Zern } 6567c8da7ce66017295a65ec028084b90800be377f8James Zern} 6577c8da7ce66017295a65ec028084b90800be377f8James Zern 6587c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, 6597c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { 6607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_row = _mm_load_si128((const __m128i*)top); 6617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i left_row = _mm_load_si128((const __m128i*)left); 6620912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const int DC = 6630912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16; 6647c8da7ce66017295a65ec028084b90800be377f8James Zern Put16(DC >> 5, dst); 6657c8da7ce66017295a65ec028084b90800be377f8James Zern} 6667c8da7ce66017295a65ec028084b90800be377f8James Zern 6677c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { 6687c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_row = _mm_load_si128((const __m128i*)top); 6690912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const int DC = VP8HorizontalAdd8b(&top_row) + 8; 6707c8da7ce66017295a65ec028084b90800be377f8James Zern Put16(DC >> 4, dst); 6717c8da7ce66017295a65ec028084b90800be377f8James Zern} 6727c8da7ce66017295a65ec028084b90800be377f8James Zern 6737c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) { 6747c8da7ce66017295a65ec028084b90800be377f8James Zern // 'left' is contiguous so we can reuse the top summation. 
6757c8da7ce66017295a65ec028084b90800be377f8James Zern DC16NoLeft(dst, left); 6767c8da7ce66017295a65ec028084b90800be377f8James Zern} 6777c8da7ce66017295a65ec028084b90800be377f8James Zern 6787c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) { 6797c8da7ce66017295a65ec028084b90800be377f8James Zern Put16(0x80, dst); 6807c8da7ce66017295a65ec028084b90800be377f8James Zern} 6817c8da7ce66017295a65ec028084b90800be377f8James Zern 6827c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left, 6837c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { 6847c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) { 6857c8da7ce66017295a65ec028084b90800be377f8James Zern if (left != NULL) { // top and left present 6867c8da7ce66017295a65ec028084b90800be377f8James Zern DC16(dst, left, top); 6877c8da7ce66017295a65ec028084b90800be377f8James Zern } else { // top, but no left 6887c8da7ce66017295a65ec028084b90800be377f8James Zern DC16NoLeft(dst, top); 6897c8da7ce66017295a65ec028084b90800be377f8James Zern } 6907c8da7ce66017295a65ec028084b90800be377f8James Zern } else if (left != NULL) { // left but no top 6917c8da7ce66017295a65ec028084b90800be377f8James Zern DC16NoTop(dst, left); 6927c8da7ce66017295a65ec028084b90800be377f8James Zern } else { // no top, no left, nothing. 
6937c8da7ce66017295a65ec028084b90800be377f8James Zern DC16NoTopLeft(dst); 6947c8da7ce66017295a65ec028084b90800be377f8James Zern } 6957c8da7ce66017295a65ec028084b90800be377f8James Zern} 6967c8da7ce66017295a65ec028084b90800be377f8James Zern 6977c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 6987c8da7ce66017295a65ec028084b90800be377f8James Zern// 4x4 predictions 6997c8da7ce66017295a65ec028084b90800be377f8James Zern 7007c8da7ce66017295a65ec028084b90800be377f8James Zern#define DST(x, y) dst[(x) + (y) * BPS] 7017c8da7ce66017295a65ec028084b90800be377f8James Zern#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) 7027c8da7ce66017295a65ec028084b90800be377f8James Zern#define AVG2(a, b) (((a) + (b) + 1) >> 1) 7037c8da7ce66017295a65ec028084b90800be377f8James Zern 7047c8da7ce66017295a65ec028084b90800be377f8James Zern// We use the following 8b-arithmetic tricks: 7057c8da7ce66017295a65ec028084b90800be377f8James Zern// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1 7067c8da7ce66017295a65ec028084b90800be377f8James Zern// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1] 7077c8da7ce66017295a65ec028084b90800be377f8James Zern// and: 7087c8da7ce66017295a65ec028084b90800be377f8James Zern// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb 7097c8da7ce66017295a65ec028084b90800be377f8James Zern// where: AC = (a + b + 1) >> 1, BC = (b + c + 1) >> 1 7107c8da7ce66017295a65ec028084b90800be377f8James Zern// and ab = a ^ b, bc = b ^ c, lsb = (AC^BC)&1 7117c8da7ce66017295a65ec028084b90800be377f8James Zern 7127c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical 7137c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i one = _mm_set1_epi8(1); 7147c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1)); 7157c8da7ce66017295a65ec028084b90800be377f8James Zern 
const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); 7167c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); 7177c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00); 7187c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one); 7197c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b = _mm_subs_epu8(a, lsb); 7207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg = _mm_avg_epu8(b, BCDEFGH0); 7217c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t vals = _mm_cvtsi128_si32(avg); 7227c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 7237c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < 4; ++i) { 7247c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + i * BPS, vals); 7257c8da7ce66017295a65ec028084b90800be377f8James Zern } 7267c8da7ce66017295a65ec028084b90800be377f8James Zern} 7277c8da7ce66017295a65ec028084b90800be377f8James Zern 7287c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal 7297c8da7ce66017295a65ec028084b90800be377f8James Zern const int X = top[-1]; 7307c8da7ce66017295a65ec028084b90800be377f8James Zern const int I = top[-2]; 7317c8da7ce66017295a65ec028084b90800be377f8James Zern const int J = top[-3]; 7327c8da7ce66017295a65ec028084b90800be377f8James Zern const int K = top[-4]; 7337c8da7ce66017295a65ec028084b90800be377f8James Zern const int L = top[-5]; 7347c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J)); 7357c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K)); 7367c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L)); 
7377c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L)); 7387c8da7ce66017295a65ec028084b90800be377f8James Zern} 7397c8da7ce66017295a65ec028084b90800be377f8James Zern 7407c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) { 7417c8da7ce66017295a65ec028084b90800be377f8James Zern uint32_t dc = 4; 7427c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 7437c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; 7447c8da7ce66017295a65ec028084b90800be377f8James Zern Fill(dst, dc >> 3, 4); 7457c8da7ce66017295a65ec028084b90800be377f8James Zern} 7467c8da7ce66017295a65ec028084b90800be377f8James Zern 7477c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left 7487c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i one = _mm_set1_epi8(1); 7497c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); 7507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1); 7517c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2); 7527c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3); 7537c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0); 7547c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one); 7557c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 7567c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0); 7577c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg 
)); 7587c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); 7597c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); 7607c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); 7617c8da7ce66017295a65ec028084b90800be377f8James Zern} 7627c8da7ce66017295a65ec028084b90800be377f8James Zern 7637c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VR4(uint8_t* dst, 7647c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { // Vertical-Right 7657c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i one = _mm_set1_epi8(1); 7667c8da7ce66017295a65ec028084b90800be377f8James Zern const int I = top[-2]; 7677c8da7ce66017295a65ec028084b90800be377f8James Zern const int J = top[-3]; 7687c8da7ce66017295a65ec028084b90800be377f8James Zern const int K = top[-4]; 7697c8da7ce66017295a65ec028084b90800be377f8James Zern const int X = top[-1]; 7707c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1)); 7717c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ABCD0 = _mm_srli_si128(XABCD, 1); 7727c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0); 7737c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i _XABCD = _mm_slli_si128(XABCD, 1); 7747c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0); 7757c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0); 7767c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one); 7777c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 
7787c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i efgh = _mm_avg_epu8(avg2, XABCD); 7797c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd )); 7807c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh )); 7817c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1))); 7827c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1))); 7837c8da7ce66017295a65ec028084b90800be377f8James Zern 7847c8da7ce66017295a65ec028084b90800be377f8James Zern // these two are hard to implement in SSE2, so we keep the C-version: 7857c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 2) = AVG3(J, I, X); 7867c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 3) = AVG3(K, J, I); 7877c8da7ce66017295a65ec028084b90800be377f8James Zern} 7887c8da7ce66017295a65ec028084b90800be377f8James Zern 7897c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VL4(uint8_t* dst, 7907c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { // Vertical-Left 7917c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i one = _mm_set1_epi8(1); 7927c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top); 7937c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1); 7947c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2); 7957c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_); 7967c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_); 7977c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg3 = _mm_avg_epu8(avg1, avg2); 
7987c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one); 7997c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_); 8007c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_); 8017c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abbc = _mm_or_si128(ab, bc); 8027c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb2 = _mm_and_si128(abbc, lsb1); 8037c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg4 = _mm_subs_epu8(avg3, lsb2); 8047c8da7ce66017295a65ec028084b90800be377f8James Zern const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4)); 8057c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 )); 8067c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 )); 8077c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1))); 8087c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1))); 8097c8da7ce66017295a65ec028084b90800be377f8James Zern 8107c8da7ce66017295a65ec028084b90800be377f8James Zern // these two are hard to get and irregular 8117c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 2) = (extra_out >> 0) & 0xff; 8127c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 3) = (extra_out >> 8) & 0xff; 8137c8da7ce66017295a65ec028084b90800be377f8James Zern} 8147c8da7ce66017295a65ec028084b90800be377f8James Zern 8157c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right 8167c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i one = _mm_set1_epi8(1); 8177c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i LKJIXABC = 
_mm_loadl_epi64((const __m128i*)(top - 5)); 8187c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4); 8197c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1); 8207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2); 8217c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD); 8227c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one); 8237c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i avg2 = _mm_subs_epu8(avg1, lsb); 8247c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_); 8257c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg )); 8267c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1))); 8277c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2))); 8287c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3))); 8297c8da7ce66017295a65ec028084b90800be377f8James Zern} 8307c8da7ce66017295a65ec028084b90800be377f8James Zern 8317c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) { 8327c8da7ce66017295a65ec028084b90800be377f8James Zern const int I = top[-2]; 8337c8da7ce66017295a65ec028084b90800be377f8James Zern const int J = top[-3]; 8347c8da7ce66017295a65ec028084b90800be377f8James Zern const int K = top[-4]; 8357c8da7ce66017295a65ec028084b90800be377f8James Zern const int L = top[-5]; 8367c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 0) = AVG2(I, J); 
8377c8da7ce66017295a65ec028084b90800be377f8James Zern DST(2, 0) = DST(0, 1) = AVG2(J, K); 8387c8da7ce66017295a65ec028084b90800be377f8James Zern DST(2, 1) = DST(0, 2) = AVG2(K, L); 8397c8da7ce66017295a65ec028084b90800be377f8James Zern DST(1, 0) = AVG3(I, J, K); 8407c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 0) = DST(1, 1) = AVG3(J, K, L); 8417c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 1) = DST(1, 2) = AVG3(K, L, L); 8427c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 2) = DST(2, 2) = 8437c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; 8447c8da7ce66017295a65ec028084b90800be377f8James Zern} 8457c8da7ce66017295a65ec028084b90800be377f8James Zern 8467c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) { 8477c8da7ce66017295a65ec028084b90800be377f8James Zern const int X = top[-1]; 8487c8da7ce66017295a65ec028084b90800be377f8James Zern const int I = top[-2]; 8497c8da7ce66017295a65ec028084b90800be377f8James Zern const int J = top[-3]; 8507c8da7ce66017295a65ec028084b90800be377f8James Zern const int K = top[-4]; 8517c8da7ce66017295a65ec028084b90800be377f8James Zern const int L = top[-5]; 8527c8da7ce66017295a65ec028084b90800be377f8James Zern const int A = top[0]; 8537c8da7ce66017295a65ec028084b90800be377f8James Zern const int B = top[1]; 8547c8da7ce66017295a65ec028084b90800be377f8James Zern const int C = top[2]; 8557c8da7ce66017295a65ec028084b90800be377f8James Zern 8567c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 0) = DST(2, 1) = AVG2(I, X); 8577c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 1) = DST(2, 2) = AVG2(J, I); 8587c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 2) = DST(2, 3) = AVG2(K, J); 8597c8da7ce66017295a65ec028084b90800be377f8James Zern DST(0, 3) = AVG2(L, K); 8607c8da7ce66017295a65ec028084b90800be377f8James Zern 8617c8da7ce66017295a65ec028084b90800be377f8James Zern DST(3, 0) = AVG3(A, 
B, C); 8627c8da7ce66017295a65ec028084b90800be377f8James Zern DST(2, 0) = AVG3(X, A, B); 8637c8da7ce66017295a65ec028084b90800be377f8James Zern DST(1, 0) = DST(3, 1) = AVG3(I, X, A); 8647c8da7ce66017295a65ec028084b90800be377f8James Zern DST(1, 1) = DST(3, 2) = AVG3(J, I, X); 8657c8da7ce66017295a65ec028084b90800be377f8James Zern DST(1, 2) = DST(3, 3) = AVG3(K, J, I); 8667c8da7ce66017295a65ec028084b90800be377f8James Zern DST(1, 3) = AVG3(L, K, J); 8677c8da7ce66017295a65ec028084b90800be377f8James Zern} 8687c8da7ce66017295a65ec028084b90800be377f8James Zern 8697c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { 8707c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 8717c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top)); 8727c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i top_base = _mm_unpacklo_epi8(top_values, zero); 8737c8da7ce66017295a65ec028084b90800be377f8James Zern int y; 8747c8da7ce66017295a65ec028084b90800be377f8James Zern for (y = 0; y < 4; ++y, dst += BPS) { 8757c8da7ce66017295a65ec028084b90800be377f8James Zern const int val = top[-2 - y] - top[-1]; 8767c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i base = _mm_set1_epi16(val); 8777c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero); 8787c8da7ce66017295a65ec028084b90800be377f8James Zern WebPUint32ToMem(dst, _mm_cvtsi128_si32(out)); 8797c8da7ce66017295a65ec028084b90800be377f8James Zern } 8807c8da7ce66017295a65ec028084b90800be377f8James Zern} 8817c8da7ce66017295a65ec028084b90800be377f8James Zern 8827c8da7ce66017295a65ec028084b90800be377f8James Zern#undef DST 8837c8da7ce66017295a65ec028084b90800be377f8James Zern#undef AVG3 8847c8da7ce66017295a65ec028084b90800be377f8James Zern#undef AVG2 8857c8da7ce66017295a65ec028084b90800be377f8James Zern 
8867c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 8877c8da7ce66017295a65ec028084b90800be377f8James Zern// luma 4x4 prediction 8887c8da7ce66017295a65ec028084b90800be377f8James Zern 8897c8da7ce66017295a65ec028084b90800be377f8James Zern// Left samples are top[-5 .. -2], top_left is top[-1], top are 8907c8da7ce66017295a65ec028084b90800be377f8James Zern// located at top[0..3], and top right is top[4..7] 8917c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void Intra4Preds(uint8_t* dst, const uint8_t* top) { 8927c8da7ce66017295a65ec028084b90800be377f8James Zern DC4(I4DC4 + dst, top); 8937c8da7ce66017295a65ec028084b90800be377f8James Zern TM4(I4TM4 + dst, top); 8947c8da7ce66017295a65ec028084b90800be377f8James Zern VE4(I4VE4 + dst, top); 8957c8da7ce66017295a65ec028084b90800be377f8James Zern HE4(I4HE4 + dst, top); 8967c8da7ce66017295a65ec028084b90800be377f8James Zern RD4(I4RD4 + dst, top); 8977c8da7ce66017295a65ec028084b90800be377f8James Zern VR4(I4VR4 + dst, top); 8987c8da7ce66017295a65ec028084b90800be377f8James Zern LD4(I4LD4 + dst, top); 8997c8da7ce66017295a65ec028084b90800be377f8James Zern VL4(I4VL4 + dst, top); 9007c8da7ce66017295a65ec028084b90800be377f8James Zern HD4(I4HD4 + dst, top); 9017c8da7ce66017295a65ec028084b90800be377f8James Zern HU4(I4HU4 + dst, top); 9027c8da7ce66017295a65ec028084b90800be377f8James Zern} 9037c8da7ce66017295a65ec028084b90800be377f8James Zern 9047c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 9057c8da7ce66017295a65ec028084b90800be377f8James Zern// Chroma 8x8 prediction (paragraph 12.2) 9067c8da7ce66017295a65ec028084b90800be377f8James Zern 9077c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void IntraChromaPreds(uint8_t* dst, const uint8_t* left, 9087c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* top) { 
9097c8da7ce66017295a65ec028084b90800be377f8James Zern // U block 9107c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvMode(C8DC8 + dst, left, top); 9117c8da7ce66017295a65ec028084b90800be377f8James Zern VerticalPred(C8VE8 + dst, top, 8); 9127c8da7ce66017295a65ec028084b90800be377f8James Zern HorizontalPred(C8HE8 + dst, left, 8); 9137c8da7ce66017295a65ec028084b90800be377f8James Zern TrueMotion(C8TM8 + dst, left, top, 8); 9147c8da7ce66017295a65ec028084b90800be377f8James Zern // V block 9157c8da7ce66017295a65ec028084b90800be377f8James Zern dst += 8; 9167c8da7ce66017295a65ec028084b90800be377f8James Zern if (top != NULL) top += 8; 9177c8da7ce66017295a65ec028084b90800be377f8James Zern if (left != NULL) left += 16; 9187c8da7ce66017295a65ec028084b90800be377f8James Zern DC8uvMode(C8DC8 + dst, left, top); 9197c8da7ce66017295a65ec028084b90800be377f8James Zern VerticalPred(C8VE8 + dst, top, 8); 9207c8da7ce66017295a65ec028084b90800be377f8James Zern HorizontalPred(C8HE8 + dst, left, 8); 9217c8da7ce66017295a65ec028084b90800be377f8James Zern TrueMotion(C8TM8 + dst, left, top, 8); 9227c8da7ce66017295a65ec028084b90800be377f8James Zern} 9237c8da7ce66017295a65ec028084b90800be377f8James Zern 9247c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 9257c8da7ce66017295a65ec028084b90800be377f8James Zern// luma 16x16 prediction (paragraph 12.3) 9267c8da7ce66017295a65ec028084b90800be377f8James Zern 9277c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void Intra16Preds(uint8_t* dst, 9287c8da7ce66017295a65ec028084b90800be377f8James Zern const uint8_t* left, const uint8_t* top) { 9297c8da7ce66017295a65ec028084b90800be377f8James Zern DC16Mode(I16DC16 + dst, left, top); 9307c8da7ce66017295a65ec028084b90800be377f8James Zern VerticalPred(I16VE16 + dst, top, 16); 9317c8da7ce66017295a65ec028084b90800be377f8James Zern HorizontalPred(I16HE16 + dst, left, 16); 9327c8da7ce66017295a65ec028084b90800be377f8James Zern 
TrueMotion(I16TM16 + dst, left, top, 16); 9337c8da7ce66017295a65ec028084b90800be377f8James Zern} 9347c8da7ce66017295a65ec028084b90800be377f8James Zern 9357c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------ 9367c8da7ce66017295a65ec028084b90800be377f8James Zern// Metric 9377c8da7ce66017295a65ec028084b90800be377f8James Zern 9387c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b, 9397c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i* const sum) { 9407c8da7ce66017295a65ec028084b90800be377f8James Zern // take abs(a-b) in 8b 9417c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a_b = _mm_subs_epu8(a, b); 9427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b_a = _mm_subs_epu8(b, a); 9437c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i abs_a_b = _mm_or_si128(a_b, b_a); 9447c8da7ce66017295a65ec028084b90800be377f8James Zern // zero-extend to 16b 9457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 9467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero); 9477c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero); 9487c8da7ce66017295a65ec028084b90800be377f8James Zern // multiply with self 9497c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sum1 = _mm_madd_epi16(C0, C0); 9507c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sum2 = _mm_madd_epi16(C1, C1); 9517c8da7ce66017295a65ec028084b90800be377f8James Zern *sum = _mm_add_epi32(sum1, sum2); 9527c8da7ce66017295a65ec028084b90800be377f8James Zern} 9537c8da7ce66017295a65ec028084b90800be377f8James Zern 9547c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b, 
9557c8da7ce66017295a65ec028084b90800be377f8James Zern int num_pairs) { 9567c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i sum = _mm_setzero_si128(); 9577c8da7ce66017295a65ec028084b90800be377f8James Zern int32_t tmp[4]; 9587c8da7ce66017295a65ec028084b90800be377f8James Zern int i; 9597c8da7ce66017295a65ec028084b90800be377f8James Zern 9607c8da7ce66017295a65ec028084b90800be377f8James Zern for (i = 0; i < num_pairs; ++i) { 9617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]); 9627c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]); 9637c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]); 9647c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]); 9657c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i sum1, sum2; 9667c8da7ce66017295a65ec028084b90800be377f8James Zern SubtractAndAccumulate(a0, b0, &sum1); 9677c8da7ce66017295a65ec028084b90800be377f8James Zern SubtractAndAccumulate(a1, b1, &sum2); 9687c8da7ce66017295a65ec028084b90800be377f8James Zern sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2)); 9697c8da7ce66017295a65ec028084b90800be377f8James Zern a += 2 * BPS; 9707c8da7ce66017295a65ec028084b90800be377f8James Zern b += 2 * BPS; 9711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora } 9727c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, sum); 9737c8da7ce66017295a65ec028084b90800be377f8James Zern return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 9741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 9751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 97633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE16x16(const uint8_t* a, const uint8_t* b) { 9777c8da7ce66017295a65ec028084b90800be377f8James Zern return SSE_16xN(a, b, 8); 
9781e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 9791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 98033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE16x8(const uint8_t* a, const uint8_t* b) { 9817c8da7ce66017295a65ec028084b90800be377f8James Zern return SSE_16xN(a, b, 4); 9821e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 9831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 9847c8da7ce66017295a65ec028084b90800be377f8James Zern#define LOAD_8x16b(ptr) \ 9857c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero) 9867c8da7ce66017295a65ec028084b90800be377f8James Zern 98733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) { 9887c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i zero = _mm_setzero_si128(); 9897c8da7ce66017295a65ec028084b90800be377f8James Zern int num_pairs = 4; 9907c8da7ce66017295a65ec028084b90800be377f8James Zern __m128i sum = zero; 9917c8da7ce66017295a65ec028084b90800be377f8James Zern int32_t tmp[4]; 9927c8da7ce66017295a65ec028084b90800be377f8James Zern while (num_pairs-- > 0) { 9937c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = LOAD_8x16b(&a[BPS * 0]); 9947c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = LOAD_8x16b(&a[BPS * 1]); 9957c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b0 = LOAD_8x16b(&b[BPS * 0]); 9967c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1 = LOAD_8x16b(&b[BPS * 1]); 9977c8da7ce66017295a65ec028084b90800be377f8James Zern // subtract 9987c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i c0 = _mm_subs_epi16(a0, b0); 9997c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i c1 = _mm_subs_epi16(a1, b1); 10007c8da7ce66017295a65ec028084b90800be377f8James Zern // multiply/accumulate with self 10017c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d0 = _mm_madd_epi16(c0, 
c0); 10027c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d1 = _mm_madd_epi16(c1, c1); 10037c8da7ce66017295a65ec028084b90800be377f8James Zern // collect 10047c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sum01 = _mm_add_epi32(d0, d1); 10057c8da7ce66017295a65ec028084b90800be377f8James Zern sum = _mm_add_epi32(sum, sum01); 10067c8da7ce66017295a65ec028084b90800be377f8James Zern a += 2 * BPS; 10077c8da7ce66017295a65ec028084b90800be377f8James Zern b += 2 * BPS; 10087c8da7ce66017295a65ec028084b90800be377f8James Zern } 10097c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, sum); 10107c8da7ce66017295a65ec028084b90800be377f8James Zern return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 10111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora} 10127c8da7ce66017295a65ec028084b90800be377f8James Zern#undef LOAD_8x16b 10131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 101433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) { 10151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i zero = _mm_setzero_si128(); 1016466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 10171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // Load values. Note that we read 8 pixels instead of 4, 10181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora // but the a/b buffers are over-allocated to that effect. 
10197c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]); 10207c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]); 10217c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]); 10227c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]); 10237c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]); 10247c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]); 10257c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]); 10267c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]); 10277c8da7ce66017295a65ec028084b90800be377f8James Zern // Combine pair of lines. 1028466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a01 = _mm_unpacklo_epi32(a0, a1); 1029466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a23 = _mm_unpacklo_epi32(a2, a3); 1030466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b01 = _mm_unpacklo_epi32(b0, b1); 1031466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b23 = _mm_unpacklo_epi32(b2, b3); 10327c8da7ce66017295a65ec028084b90800be377f8James Zern // Convert to 16b. 
1033466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a01s = _mm_unpacklo_epi8(a01, zero); 1034466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a23s = _mm_unpacklo_epi8(a23, zero); 1035466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b01s = _mm_unpacklo_epi8(b01, zero); 1036466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b23s = _mm_unpacklo_epi8(b23, zero); 10377c8da7ce66017295a65ec028084b90800be377f8James Zern // subtract, square and accumulate 10387c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d0 = _mm_subs_epi16(a01s, b01s); 10397c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d1 = _mm_subs_epi16(a23s, b23s); 10407c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i e0 = _mm_madd_epi16(d0, d0); 10417c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i e1 = _mm_madd_epi16(d1, d1); 10427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sum = _mm_add_epi32(e0, e1); 10431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 1044466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora int32_t tmp[4]; 10457c8da7ce66017295a65ec028084b90800be377f8James Zern _mm_storeu_si128((__m128i*)tmp, sum); 1046466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 1047466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 1048466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1049a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------ 1050fa39824bb690c5806358871f46940d0450973d8aJames Zern 1051fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void Mean16x4(const uint8_t* ref, uint32_t dc[4]) { 1052fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i mask = _mm_set1_epi16(0x00ff); 1053fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]); 
1054fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]); 1055fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]); 1056fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]); 1057fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte 1058fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b1 = _mm_srli_epi16(a1, 8); 1059fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b2 = _mm_srli_epi16(a2, 8); 1060fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b3 = _mm_srli_epi16(a3, 8); 1061fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c0 = _mm_and_si128(a0, mask); // lo byte 1062fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c1 = _mm_and_si128(a1, mask); 1063fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c2 = _mm_and_si128(a2, mask); 1064fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c3 = _mm_and_si128(a3, mask); 1065fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d0 = _mm_add_epi32(b0, c0); 1066fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d1 = _mm_add_epi32(b1, c1); 1067fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d2 = _mm_add_epi32(b2, c2); 1068fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i d3 = _mm_add_epi32(b3, c3); 1069fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i e0 = _mm_add_epi32(d0, d1); 1070fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i e1 = _mm_add_epi32(d2, d3); 1071fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i f0 = _mm_add_epi32(e0, e1); 1072fa39824bb690c5806358871f46940d0450973d8aJames Zern uint16_t tmp[8]; 1073fa39824bb690c5806358871f46940d0450973d8aJames Zern 
_mm_storeu_si128((__m128i*)tmp, f0); 1074fa39824bb690c5806358871f46940d0450973d8aJames Zern dc[0] = tmp[0] + tmp[1]; 1075fa39824bb690c5806358871f46940d0450973d8aJames Zern dc[1] = tmp[2] + tmp[3]; 1076fa39824bb690c5806358871f46940d0450973d8aJames Zern dc[2] = tmp[4] + tmp[5]; 1077fa39824bb690c5806358871f46940d0450973d8aJames Zern dc[3] = tmp[6] + tmp[7]; 1078fa39824bb690c5806358871f46940d0450973d8aJames Zern} 1079fa39824bb690c5806358871f46940d0450973d8aJames Zern 1080fa39824bb690c5806358871f46940d0450973d8aJames Zern//------------------------------------------------------------------------------ 1081466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Texture distortion 1082466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// 1083466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// We try to match the spectral content (weighted) between source and 1084466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// reconstructed samples. 1085466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1086466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Hadamard transform 10870912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern// Returns the weighted sum of the absolute value of transformed coefficients. 10880912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern// w[] contains a row-major 4 by 4 symmetric matrix. 108933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int TTransform(const uint8_t* inA, const uint8_t* inB, 109033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16_t* const w) { 1091466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora int32_t sum[4]; 1092466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i tmp_0, tmp_1, tmp_2, tmp_3; 1093466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i zero = _mm_setzero_si128(); 1094466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 10950912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Load and combine inputs. 
1096466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 10977c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); 10987c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); 10997c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); 11007c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); 11017c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); 11027c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); 11037c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); 11047c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); 1105466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1106466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Combine inA and inB (we'll do two transforms in parallel). 
11070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); 11080912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); 11090912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); 11100912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); 11110912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern tmp_0 = _mm_unpacklo_epi8(inAB_0, zero); 11120912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern tmp_1 = _mm_unpacklo_epi8(inAB_1, zero); 11130912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern tmp_2 = _mm_unpacklo_epi8(inAB_2, zero); 11140912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern tmp_3 = _mm_unpacklo_epi8(inAB_3, zero); 11150912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // a00 a01 a02 a03 b00 b01 b02 b03 11160912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // a10 a11 a12 a13 b10 b11 b12 b13 11170912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // a20 a21 a22 a23 b20 b21 b22 b23 11180912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // a30 a31 a32 a33 b30 b31 b32 b33 1119466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1120466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 11210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Vertical pass first to avoid a transpose (vertical and horizontal passes 11220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // are commutative because w/kWeightY is symmetric) and subsequent transpose. 1123466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1124466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Calculate a and b (two 4x4 at once). 
11251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 11261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 11271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 11281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 11291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i b0 = _mm_add_epi16(a0, a1); 1130466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b1 = _mm_add_epi16(a3, a2); 1131466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b2 = _mm_sub_epi16(a3, a2); 1132466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b3 = _mm_sub_epi16(a0, a1); 1133466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a00 a01 a02 a03 b00 b01 b02 b03 1134466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a10 a11 a12 a13 b10 b11 b12 b13 1135466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a20 a21 a22 a23 b20 b21 b22 b23 1136466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // a30 a31 a32 a33 b30 b31 b32 b33 1137466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1138466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Transpose the two 4x4. 11390912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); 1140466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1141466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 11420912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern // Horizontal pass and difference of weighted sums. 1143466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1144466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load all inputs. 
11457c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); 11467c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); 1147466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1148466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Calculate a and b (two 4x4 at once). 1149466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); 1150466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); 1151466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); 1152466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); 1153466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b0 = _mm_add_epi16(a0, a1); 1154466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b1 = _mm_add_epi16(a3, a2); 1155466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b2 = _mm_sub_epi16(a3, a2); 1156466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const __m128i b3 = _mm_sub_epi16(a0, a1); 1157466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1158466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Separate the transforms of inA and inB. 
1159466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); 1160466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); 1161466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); 1162466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); 1163466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1164466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 11657c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d0 = _mm_sub_epi16(zero, A_b0); 11667c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d1 = _mm_sub_epi16(zero, A_b2); 11677c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d2 = _mm_sub_epi16(zero, B_b0); 11687c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i d3 = _mm_sub_epi16(zero, B_b2); 11697c8da7ce66017295a65ec028084b90800be377f8James Zern A_b0 = _mm_max_epi16(A_b0, d0); // abs(v), 16b 11707c8da7ce66017295a65ec028084b90800be377f8James Zern A_b2 = _mm_max_epi16(A_b2, d1); 11717c8da7ce66017295a65ec028084b90800be377f8James Zern B_b0 = _mm_max_epi16(B_b0, d2); 11727c8da7ce66017295a65ec028084b90800be377f8James Zern B_b2 = _mm_max_epi16(B_b2, d3); 1173466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1174466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1175466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // weighted sums 1176466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora A_b0 = _mm_madd_epi16(A_b0, w_0); 1177466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora A_b2 = _mm_madd_epi16(A_b2, w_8); 1178466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora B_b0 = _mm_madd_epi16(B_b0, w_0); 1179466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora B_b2 = _mm_madd_epi16(B_b2, w_8); 1180466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora A_b0 = _mm_add_epi32(A_b0, A_b2); 1181466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora B_b0 = _mm_add_epi32(B_b0, 
B_b2); 1182466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1183466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // difference of weighted sums 1184466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora A_b0 = _mm_sub_epi32(A_b0, B_b0); 1185466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storeu_si128((__m128i*)&sum[0], A_b0); 1186466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1187466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora return sum[0] + sum[1] + sum[2] + sum[3]; 1188466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 1189466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 119033f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto4x4(const uint8_t* const a, const uint8_t* const b, 119133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16_t* const w) { 119233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const int diff_sum = TTransform(a, b, w); 11931e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora return abs(diff_sum) >> 5; 1194466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 1195466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 119633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b, 119733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16_t* const w) { 1198466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora int D = 0; 1199466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora int x, y; 1200466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora for (y = 0; y < 16 * BPS; y += 4 * BPS) { 1201466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora for (x = 0; x < 16; x += 4) { 120233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora D += Disto4x4(a + x + y, b + x + y, w); 1203466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1204466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1205466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora return D; 1206466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 
1207466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1208a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------ 1209466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Quantization 1210466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// 1211466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 121233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], 121333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const uint16_t* const sharpen, 121433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const VP8Matrix* const mtx) { 12150406ce1417f76f2034833414dcecc9f56253640cVikas Arora const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); 12161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora const __m128i zero = _mm_setzero_si128(); 1217466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i coeff0, coeff8; 1218466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i out0, out8; 1219466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i packed_out; 1220466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1221466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // Load all inputs. 
1222466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); 1223466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); 12247c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); 12257c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); 12267c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); 12277c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); 1228466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 122933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // extract sign(in) (0x0000 if positive, 0xffff if negative) 123033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i sign0 = _mm_cmpgt_epi16(zero, in0); 123133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i sign8 = _mm_cmpgt_epi16(zero, in8); 1232466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1233466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // coeff = abs(in) = (in ^ sign) - sign 1234466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora coeff0 = _mm_xor_si128(in0, sign0); 1235466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora coeff8 = _mm_xor_si128(in8, sign8); 1236466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora coeff0 = _mm_sub_epi16(coeff0, sign0); 1237466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora coeff8 = _mm_sub_epi16(coeff8, sign8); 1238466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1239466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // coeff = abs(in) + sharpen 124033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora if (sharpen != NULL) { 12417c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); 
12427c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); 124333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora coeff0 = _mm_add_epi16(coeff0, sharpen0); 124433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora coeff8 = _mm_add_epi16(coeff8, sharpen8); 124533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora } 1246466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 124733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // out = (coeff * iQ + B) >> QFIX 1248466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1249466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // doing calculations with 32b precision (QFIX=17) 1250466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // out = (coeff * iQ) 125133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); 125233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); 125333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); 125433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); 1255466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); 1256466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); 1257466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); 1258466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); 1259466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // out = (coeff * iQ + B) 12607c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); 12617c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bias_04 = 
_mm_loadu_si128((const __m128i*)&mtx->bias_[4]); 12627c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); 12637c8da7ce66017295a65ec028084b90800be377f8James Zern const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); 1264466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_00 = _mm_add_epi32(out_00, bias_00); 1265466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_04 = _mm_add_epi32(out_04, bias_04); 1266466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_08 = _mm_add_epi32(out_08, bias_08); 1267466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_12 = _mm_add_epi32(out_12, bias_12); 126833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora // out = QUANTDIV(coeff, iQ, B, QFIX) 1269466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_00 = _mm_srai_epi32(out_00, QFIX); 1270466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_04 = _mm_srai_epi32(out_04, QFIX); 1271466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_08 = _mm_srai_epi32(out_08, QFIX); 1272466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out_12 = _mm_srai_epi32(out_12, QFIX); 12730406ce1417f76f2034833414dcecc9f56253640cVikas Arora 1274466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // pack result as 16b 1275466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out0 = _mm_packs_epi32(out_00, out_04); 1276466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out8 = _mm_packs_epi32(out_08, out_12); 12770406ce1417f76f2034833414dcecc9f56253640cVikas Arora 12780406ce1417f76f2034833414dcecc9f56253640cVikas Arora // if (coeff > 2047) coeff = 2047 12790406ce1417f76f2034833414dcecc9f56253640cVikas Arora out0 = _mm_min_epi16(out0, max_coeff_2047); 12800406ce1417f76f2034833414dcecc9f56253640cVikas Arora out8 = _mm_min_epi16(out8, max_coeff_2047); 1281466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1282466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 
1283466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // get sign back (if (sign[j]) out_n = -out_n) 1284466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out0 = _mm_xor_si128(out0, sign0); 1285466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out8 = _mm_xor_si128(out8, sign8); 1286466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out0 = _mm_sub_epi16(out0, sign0); 1287466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out8 = _mm_sub_epi16(out8, sign8); 1288466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1289466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // in = out * Q 1290466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in0 = _mm_mullo_epi16(out0, q0); 1291466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora in8 = _mm_mullo_epi16(out8, q8); 1292466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 12938b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora _mm_storeu_si128((__m128i*)&in[0], in0); 12948b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora _mm_storeu_si128((__m128i*)&in[8], in8); 1295466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1296466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // zigzag the output before storing it. 1297466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // 1298466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // The zigzag pattern can almost be reproduced with a small sequence of 1299466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // shuffles. After it, we only need to swap the 7th (ending up in third 1300466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // position instead of twelfth) and 8th values. 
1301466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1302466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora __m128i outZ0, outZ8; 1303466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); 1304466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); 1305466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); 1306466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); 1307466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); 1308466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); 1309466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storeu_si128((__m128i*)&out[0], outZ0); 1310466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora _mm_storeu_si128((__m128i*)&out[8], outZ8); 1311466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora packed_out = _mm_packs_epi16(outZ0, outZ8); 1312466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1313466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora { 1314466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const int16_t outZ_12 = out[12]; 1315466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora const int16_t outZ_3 = out[3]; 1316466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out[3] = outZ_12; 1317466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora out[12] = outZ_3; 1318466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora } 1319466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 1320466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora // detect if all 'out' values are zeroes or not 132133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); 1322466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 
1323466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 132433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16], 132533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const VP8Matrix* const mtx) { 132633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx); 132733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 132833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 132933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlockWHT(int16_t in[16], int16_t out[16], 133033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora const VP8Matrix* const mtx) { 133133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora return DoQuantizeBlock(in, out, NULL, mtx); 133233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora} 133333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora 13347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic int Quantize2Blocks(int16_t in[32], int16_t out[32], 13357c8da7ce66017295a65ec028084b90800be377f8James Zern const VP8Matrix* const mtx) { 13367c8da7ce66017295a65ec028084b90800be377f8James Zern int nz; 13377c8da7ce66017295a65ec028084b90800be377f8James Zern const uint16_t* const sharpen = &mtx->sharpen_[0]; 13387c8da7ce66017295a65ec028084b90800be377f8James Zern nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0; 13397c8da7ce66017295a65ec028084b90800be377f8James Zern nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1; 13407c8da7ce66017295a65ec028084b90800be377f8James Zern return nz; 13417c8da7ce66017295a65ec028084b90800be377f8James Zern} 13421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 13431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------ 13441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point 13451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 
1346466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Aroraextern void VP8EncDspInitSSE2(void); 13471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora 13487c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) { 134933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8CollectHistogram = CollectHistogram; 13507c8da7ce66017295a65ec028084b90800be377f8James Zern VP8EncPredLuma16 = Intra16Preds; 13517c8da7ce66017295a65ec028084b90800be377f8James Zern VP8EncPredChroma8 = IntraChromaPreds; 13527c8da7ce66017295a65ec028084b90800be377f8James Zern VP8EncPredLuma4 = Intra4Preds; 135333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8EncQuantizeBlock = QuantizeBlock; 13547c8da7ce66017295a65ec028084b90800be377f8James Zern VP8EncQuantize2Blocks = Quantize2Blocks; 135533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8EncQuantizeBlockWHT = QuantizeBlockWHT; 135633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8ITransform = ITransform; 135733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8FTransform = FTransform; 13587c8da7ce66017295a65ec028084b90800be377f8James Zern VP8FTransform2 = FTransform2; 135933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8FTransformWHT = FTransformWHT; 136033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE16x16 = SSE16x16; 136133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE16x8 = SSE16x8; 136233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE8x8 = SSE8x8; 136333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8SSE4x4 = SSE4x4; 136433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8TDisto4x4 = Disto4x4; 136533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora VP8TDisto16x16 = Disto16x16; 1366fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8Mean16x4 = Mean16x4; 1367fa39824bb690c5806358871f46940d0450973d8aJames Zern} 1368fa39824bb690c5806358871f46940d0450973d8aJames Zern 1369fa39824bb690c5806358871f46940d0450973d8aJames 
Zern//------------------------------------------------------------------------------ 1370fa39824bb690c5806358871f46940d0450973d8aJames Zern// SSIM / PSNR entry point (TODO(skal): move to its own file later) 1371fa39824bb690c5806358871f46940d0450973d8aJames Zern 1372fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t AccumulateSSE_SSE2(const uint8_t* src1, 1373fa39824bb690c5806358871f46940d0450973d8aJames Zern const uint8_t* src2, int len) { 1374fa39824bb690c5806358871f46940d0450973d8aJames Zern int i = 0; 1375fa39824bb690c5806358871f46940d0450973d8aJames Zern uint32_t sse2 = 0; 1376fa39824bb690c5806358871f46940d0450973d8aJames Zern if (len >= 16) { 1377fa39824bb690c5806358871f46940d0450973d8aJames Zern const int limit = len - 32; 1378fa39824bb690c5806358871f46940d0450973d8aJames Zern int32_t tmp[4]; 1379fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i sum1; 1380fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i sum = _mm_setzero_si128(); 1381fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]); 1382fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]); 1383fa39824bb690c5806358871f46940d0450973d8aJames Zern i += 16; 1384fa39824bb690c5806358871f46940d0450973d8aJames Zern while (i <= limit) { 1385fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]); 1386fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]); 1387fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i sum2; 1388fa39824bb690c5806358871f46940d0450973d8aJames Zern i += 16; 1389fa39824bb690c5806358871f46940d0450973d8aJames Zern SubtractAndAccumulate(a0, b0, &sum1); 1390fa39824bb690c5806358871f46940d0450973d8aJames Zern sum = _mm_add_epi32(sum, sum1); 1391fa39824bb690c5806358871f46940d0450973d8aJames Zern a0 = _mm_loadu_si128((const __m128i*)&src1[i]); 
1392fa39824bb690c5806358871f46940d0450973d8aJames Zern b0 = _mm_loadu_si128((const __m128i*)&src2[i]); 1393fa39824bb690c5806358871f46940d0450973d8aJames Zern i += 16; 1394fa39824bb690c5806358871f46940d0450973d8aJames Zern SubtractAndAccumulate(a1, b1, &sum2); 1395fa39824bb690c5806358871f46940d0450973d8aJames Zern sum = _mm_add_epi32(sum, sum2); 1396fa39824bb690c5806358871f46940d0450973d8aJames Zern } 1397fa39824bb690c5806358871f46940d0450973d8aJames Zern SubtractAndAccumulate(a0, b0, &sum1); 1398fa39824bb690c5806358871f46940d0450973d8aJames Zern sum = _mm_add_epi32(sum, sum1); 1399fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)tmp, sum); 1400fa39824bb690c5806358871f46940d0450973d8aJames Zern sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]); 1401fa39824bb690c5806358871f46940d0450973d8aJames Zern } 1402fa39824bb690c5806358871f46940d0450973d8aJames Zern 1403fa39824bb690c5806358871f46940d0450973d8aJames Zern for (; i < len; ++i) { 1404fa39824bb690c5806358871f46940d0450973d8aJames Zern const int32_t diff = src1[i] - src2[i]; 1405fa39824bb690c5806358871f46940d0450973d8aJames Zern sse2 += diff * diff; 1406fa39824bb690c5806358871f46940d0450973d8aJames Zern } 1407fa39824bb690c5806358871f46940d0450973d8aJames Zern return sse2; 1408fa39824bb690c5806358871f46940d0450973d8aJames Zern} 1409fa39824bb690c5806358871f46940d0450973d8aJames Zern 1410fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t HorizontalAdd16b(const __m128i* const m) { 1411fa39824bb690c5806358871f46940d0450973d8aJames Zern uint16_t tmp[8]; 1412fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a = _mm_srli_si128(*m, 8); 1413fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b = _mm_add_epi16(*m, a); 1414fa39824bb690c5806358871f46940d0450973d8aJames Zern _mm_storeu_si128((__m128i*)tmp, b); 1415fa39824bb690c5806358871f46940d0450973d8aJames Zern return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0]; 1416fa39824bb690c5806358871f46940d0450973d8aJames 
Zern} 1417fa39824bb690c5806358871f46940d0450973d8aJames Zern 1418fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic uint32_t HorizontalAdd32b(const __m128i* const m) { 1419fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a = _mm_srli_si128(*m, 8); 1420fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b = _mm_add_epi32(*m, a); 1421fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4)); 1422fa39824bb690c5806358871f46940d0450973d8aJames Zern return (uint32_t)_mm_cvtsi128_si32(c); 1423fa39824bb690c5806358871f46940d0450973d8aJames Zern} 1424fa39824bb690c5806358871f46940d0450973d8aJames Zern 1425fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 }; 1426fa39824bb690c5806358871f46940d0450973d8aJames Zern 1427fa39824bb690c5806358871f46940d0450973d8aJames Zern#define ACCUMULATE_ROW(WEIGHT) do { \ 1428fa39824bb690c5806358871f46940d0450973d8aJames Zern /* compute row weight (Wx * Wy) */ \ 1429fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i Wy = _mm_set1_epi16((WEIGHT)); \ 1430fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i W = _mm_mullo_epi16(Wx, Wy); \ 1431fa39824bb690c5806358871f46940d0450973d8aJames Zern /* process 8 bytes at a time (7 bytes, actually) */ \ 1432fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \ 1433fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \ 1434fa39824bb690c5806358871f46940d0450973d8aJames Zern /* convert to 16b and multiply by weight */ \ 1435fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \ 1436fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \ 1437fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i wa1 = _mm_mullo_epi16(a1, W); \ 
1438fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i wb1 = _mm_mullo_epi16(b1, W); \ 1439fa39824bb690c5806358871f46940d0450973d8aJames Zern /* accumulate */ \ 1440fa39824bb690c5806358871f46940d0450973d8aJames Zern xm = _mm_add_epi16(xm, wa1); \ 1441fa39824bb690c5806358871f46940d0450973d8aJames Zern ym = _mm_add_epi16(ym, wb1); \ 1442fa39824bb690c5806358871f46940d0450973d8aJames Zern xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \ 1443fa39824bb690c5806358871f46940d0450973d8aJames Zern xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \ 1444fa39824bb690c5806358871f46940d0450973d8aJames Zern yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \ 1445fa39824bb690c5806358871f46940d0450973d8aJames Zern src1 += stride1; \ 1446fa39824bb690c5806358871f46940d0450973d8aJames Zern src2 += stride2; \ 1447fa39824bb690c5806358871f46940d0450973d8aJames Zern} while (0) 1448fa39824bb690c5806358871f46940d0450973d8aJames Zern 1449fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic double SSIMGet_SSE2(const uint8_t* src1, int stride1, 1450fa39824bb690c5806358871f46940d0450973d8aJames Zern const uint8_t* src2, int stride2) { 1451fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8DistoStats stats; 1452fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i zero = _mm_setzero_si128(); 1453fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i xm = zero, ym = zero; // 16b accums 1454fa39824bb690c5806358871f46940d0450973d8aJames Zern __m128i xxm = zero, yym = zero, xym = zero; // 32b accum 1455fa39824bb690c5806358871f46940d0450973d8aJames Zern const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight); 1456fa39824bb690c5806358871f46940d0450973d8aJames Zern assert(2 * VP8_SSIM_KERNEL + 1 == 7); 1457fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(1); 1458fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(2); 1459fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(3); 
1460fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(4); 1461fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(3); 1462fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(2); 1463fa39824bb690c5806358871f46940d0450973d8aJames Zern ACCUMULATE_ROW(1); 1464fa39824bb690c5806358871f46940d0450973d8aJames Zern stats.xm = HorizontalAdd16b(&xm); 1465fa39824bb690c5806358871f46940d0450973d8aJames Zern stats.ym = HorizontalAdd16b(&ym); 1466fa39824bb690c5806358871f46940d0450973d8aJames Zern stats.xxm = HorizontalAdd32b(&xxm); 1467fa39824bb690c5806358871f46940d0450973d8aJames Zern stats.xym = HorizontalAdd32b(&xym); 1468fa39824bb690c5806358871f46940d0450973d8aJames Zern stats.yym = HorizontalAdd32b(&yym); 1469fa39824bb690c5806358871f46940d0450973d8aJames Zern return VP8SSIMFromStats(&stats); 1470fa39824bb690c5806358871f46940d0450973d8aJames Zern} 1471fa39824bb690c5806358871f46940d0450973d8aJames Zern 1472fa39824bb690c5806358871f46940d0450973d8aJames Zernextern void VP8SSIMDspInitSSE2(void); 1473fa39824bb690c5806358871f46940d0450973d8aJames Zern 1474fa39824bb690c5806358871f46940d0450973d8aJames ZernWEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) { 1475fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8AccumulateSSE = AccumulateSSE_SSE2; 1476fa39824bb690c5806358871f46940d0450973d8aJames Zern VP8SSIMGet = SSIMGet_SSE2; 1477466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora} 1478466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora 14797c8da7ce66017295a65ec028084b90800be377f8James Zern#else // !WEBP_USE_SSE2 14807c8da7ce66017295a65ec028084b90800be377f8James Zern 14817c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8EncDspInitSSE2) 1482fa39824bb690c5806358871f46940d0450973d8aJames ZernWEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2) 14837c8da7ce66017295a65ec028084b90800be377f8James Zern 14847c8da7ce66017295a65ec028084b90800be377f8James Zern#endif // WEBP_USE_SSE2 1485