// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of speed-critical encoding functions.
//
// Author: Christian Duvivier (cduvivier@google.com)
13466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
14a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#include "./dsp.h"
15a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora
16a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#if defined(WEBP_USE_SSE2)
17fa39824bb690c5806358871f46940d0450973d8aJames Zern#include <assert.h>
18a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora#include <stdlib.h>  // for abs()
19466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora#include <emmintrin.h>
20466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern#include "./common_sse2.h"
22fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "../enc/cost_enc.h"
23fa39824bb690c5806358871f46940d0450973d8aJames Zern#include "../enc/vp8i_enc.h"
24466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
27466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
28466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Does one or two inverse transforms.
2933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
3033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                       int do_two) {
31466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // This implementation makes use of 16-bit fixed point versions of two
32466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // multiply constants:
33466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
34466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
35466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //
36466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // To be able to use signed 16-bit integers, we use the following trick to
37466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // have constants within range:
38466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // - Associated constants are obtained by subtracting the 16-bit fixed point
39466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //   version of one:
40466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
41466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //      K1 = 85267  =>  k1 =  20091
42466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //      K2 = 35468  =>  k2 = -30068
43466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // - The multiplication of a variable by a constant become the sum of the
44466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //   variable and the multiplication of that variable by the associated
45466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //   constant:
46466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
47466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i k1 = _mm_set1_epi16(20091);
48466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i k2 = _mm_set1_epi16(-30068);
49466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i T0, T1, T2, T3;
50466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
51466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Load and concatenate the transform coefficients (we'll do two inverse
52466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // transforms in parallel). In the case of only one inverse transform, the
53466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // second half of the vectors will just contain random value we'll never
54466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // use nor store.
55466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i in0, in1, in2, in3;
56466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
577c8da7ce66017295a65ec028084b90800be377f8James Zern    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
587c8da7ce66017295a65ec028084b90800be377f8James Zern    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
597c8da7ce66017295a65ec028084b90800be377f8James Zern    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
607c8da7ce66017295a65ec028084b90800be377f8James Zern    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
61466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a00 a10 a20 a30   x x x x
62466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a01 a11 a21 a31   x x x x
63466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a02 a12 a22 a32   x x x x
64466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a03 a13 a23 a33   x x x x
65466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    if (do_two) {
667c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
677c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
687c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
697c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
70466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      in0 = _mm_unpacklo_epi64(in0, inB0);
71466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      in1 = _mm_unpacklo_epi64(in1, inB1);
72466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      in2 = _mm_unpacklo_epi64(in2, inB2);
73466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      in3 = _mm_unpacklo_epi64(in3, inB3);
74466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // a00 a10 a20 a30   b00 b10 b20 b30
75466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // a01 a11 a21 a31   b01 b11 b21 b31
76466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // a02 a12 a22 a32   b02 b12 b22 b32
77466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // a03 a13 a23 a33   b03 b13 b23 b33
78466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    }
79466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
80466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
81466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Vertical pass and subsequent transpose.
82466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
83466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // First pass, c and d calculations are longer because of the "trick"
84466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // multiplications.
85466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a = _mm_add_epi16(in0, in2);
86466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b = _mm_sub_epi16(in0, in2);
87466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
88466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
89466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
90466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c3 = _mm_sub_epi16(in1, in3);
91466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c4 = _mm_sub_epi16(c1, c2);
92466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c = _mm_add_epi16(c3, c4);
93466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
94466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
95466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
96466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d3 = _mm_add_epi16(in1, in3);
97466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d4 = _mm_add_epi16(d1, d2);
98466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d = _mm_add_epi16(d3, d4);
99466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
100466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Second pass.
101466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp0 = _mm_add_epi16(a, d);
102466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp1 = _mm_add_epi16(b, c);
103466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp2 = _mm_sub_epi16(b, c);
104466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp3 = _mm_sub_epi16(a, d);
105466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
106466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Transpose the two 4x4.
1070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3);
108466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
109466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
110466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Horizontal pass and subsequent transpose.
111466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
112466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // First pass, c and d calculations are longer because of the "trick"
113466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // multiplications.
114466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i four = _mm_set1_epi16(4);
115466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i dc = _mm_add_epi16(T0, four);
116466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a =  _mm_add_epi16(dc, T2);
117466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b =  _mm_sub_epi16(dc, T2);
118466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
119466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
120466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
121466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c3 = _mm_sub_epi16(T1, T3);
122466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c4 = _mm_sub_epi16(c1, c2);
123466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i c = _mm_add_epi16(c3, c4);
124466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
125466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
126466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
127466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d3 = _mm_add_epi16(T1, T3);
128466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d4 = _mm_add_epi16(d1, d2);
129466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i d = _mm_add_epi16(d3, d4);
130466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
131466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Second pass.
132466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp0 = _mm_add_epi16(a, d);
133466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp1 = _mm_add_epi16(b, c);
134466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp2 = _mm_sub_epi16(b, c);
135466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i tmp3 = _mm_sub_epi16(a, d);
136466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
137466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
138466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
139466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);
140466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
141466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Transpose the two 4x4.
1420912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
143fa39824bb690c5806358871f46940d0450973d8aJames Zern                           &T2, &T3);
144466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
145466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
146466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Add inverse transform to 'ref' and store.
147466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1481e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i zero = _mm_setzero_si128();
149466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Load the reference(s).
150466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i ref0, ref1, ref2, ref3;
151466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    if (do_two) {
152466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // Load eight bytes/pixels per line.
1537c8da7ce66017295a65ec028084b90800be377f8James Zern      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
1547c8da7ce66017295a65ec028084b90800be377f8James Zern      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
1557c8da7ce66017295a65ec028084b90800be377f8James Zern      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
1567c8da7ce66017295a65ec028084b90800be377f8James Zern      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
157466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    } else {
158466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // Load four bytes/pixels per line.
1597c8da7ce66017295a65ec028084b90800be377f8James Zern      ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
1607c8da7ce66017295a65ec028084b90800be377f8James Zern      ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
1617c8da7ce66017295a65ec028084b90800be377f8James Zern      ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
1627c8da7ce66017295a65ec028084b90800be377f8James Zern      ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
163466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    }
164466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Convert to 16b.
165466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref0 = _mm_unpacklo_epi8(ref0, zero);
166466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref1 = _mm_unpacklo_epi8(ref1, zero);
167466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref2 = _mm_unpacklo_epi8(ref2, zero);
168466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref3 = _mm_unpacklo_epi8(ref3, zero);
169466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Add the inverse transform(s).
170466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref0 = _mm_add_epi16(ref0, T0);
171466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref1 = _mm_add_epi16(ref1, T1);
172466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref2 = _mm_add_epi16(ref2, T2);
173466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref3 = _mm_add_epi16(ref3, T3);
174466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Unsigned saturate to 8b.
175466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref0 = _mm_packus_epi16(ref0, ref0);
176466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref1 = _mm_packus_epi16(ref1, ref1);
177466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref2 = _mm_packus_epi16(ref2, ref2);
178466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    ref3 = _mm_packus_epi16(ref3, ref3);
179466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Store the results.
180466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    if (do_two) {
181466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // Store eight bytes/pixels per line.
182466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
183466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
184466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
185466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
186466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    } else {
187466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora      // Store four bytes/pixels per line.
1887c8da7ce66017295a65ec028084b90800be377f8James Zern      WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
1897c8da7ce66017295a65ec028084b90800be377f8James Zern      WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
1907c8da7ce66017295a65ec028084b90800be377f8James Zern      WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
1917c8da7ce66017295a65ec028084b90800be377f8James Zern      WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
192466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    }
193466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
194466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
195466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
// First pass of the forward transform: transforms each of the four rows of a
// 4x4 block of 16-bit residuals.
//   in01:  rows 0 and 1, laid out as  00 01 10 11 02 03 12 13
//   in23:  rows 2 and 3, laid out as  20 21 30 31 22 23 32 33
//          (first digit = row, second digit = column)
//   out01: transformed row 0 in the low four lanes, row 1 in the high four.
//   out32: transformed row 3 in the low four lanes, row 2 in the high four.
// For a row (d0, d1, d2, d3), with a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2
// and a3 = d0 - d3, the four outputs are:
//   (a0 + a1) << 3
//   (a2 * 2217 + a3 * 5352 + 1812) >> 9
//   (a0 - a1) << 3
//   (a3 * 2217 - a2 * 5352 +  937) >> 9
static void FTransformPass1(const __m128i* const in01,
                            const __m128i* const in23,
                            __m128i* const out01,
                            __m128i* const out32) {
  // Multiplier pairs for _mm_madd_epi16(). _mm_set_epi16() lists the highest
  // lane first, so each (low, high) pair appears reversed in the source.
  const __m128i mul_sum8 = _mm_set1_epi16(8);
  const __m128i mul_diff8 = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i mul_out1 = _mm_set_epi16(2217, 5352, 2217, 5352,
                                         2217, 5352, 2217, 5352);
  const __m128i mul_out3 = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                         -5352, 2217, -5352, 2217);
  // Rounding terms applied before the '>> 9'.
  const __m128i round_out1 = _mm_set1_epi32(1812);
  const __m128i round_out3 = _mm_set1_epi32(937);

  // Swap columns 2 and 3 within each row:
  //   00 01 10 11 03 02 13 12
  //   20 21 30 31 23 22 33 32
  const __m128i swap01 = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
  const __m128i swap23 = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
  // Gather columns (0,1) of all rows, and columns (3,2) of all rows:
  //   00 01 10 11 20 21 30 31
  //   03 02 13 12 23 22 33 32
  const __m128i cols01 = _mm_unpacklo_epi64(swap01, swap23);
  const __m128i cols32 = _mm_unpackhi_epi64(swap01, swap23);
  // sums  = [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
  // diffs = [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
  const __m128i sums = _mm_add_epi16(cols01, cols32);
  const __m128i diffs = _mm_sub_epi16(cols01, cols32);

  // One 32-bit result per row for each of the four outputs.
  const __m128i res0 = _mm_madd_epi16(sums, mul_sum8);   // (a0 + a1) << 3
  const __m128i res2 = _mm_madd_epi16(sums, mul_diff8);  // (a0 - a1) << 3
  const __m128i res1 = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(diffs, mul_out1), round_out1), 9);
  const __m128i res3 = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(diffs, mul_out3), round_out3), 9);

  // Back to 16 bits, then regroup the values row by row.
  const __m128i packed02 = _mm_packs_epi32(res0, res2);
  const __m128i packed13 = _mm_packs_epi32(res1, res3);
  // pairs01 = outputs (0,1) of rows 0..3, pairs23 = outputs (2,3) of rows 0..3
  const __m128i pairs01 = _mm_unpacklo_epi16(packed02, packed13);
  const __m128i pairs23 = _mm_unpackhi_epi16(packed02, packed13);
  const __m128i rows01 = _mm_unpacklo_epi32(pairs01, pairs23);
  const __m128i rows23 = _mm_unpackhi_epi32(pairs01, pairs23);

  *out01 = rows01;
  // Exchange the two 64-bit halves so that row 3 comes first.
  *out32 = _mm_shuffle_epi32(rows23, _MM_SHUFFLE(1, 0, 3, 2));
}
2417c8da7ce66017295a65ec028084b90800be377f8James Zern
// Second pass of the forward transform: combines the four vectors v0..v3
// lane by lane and writes the 16 final coefficients.
//   v01: v0 in the low four lanes, v1 in the high four lanes.
//   v32: v3 in the low four lanes, v2 in the high four lanes.
//   out: receives 16 values. For each lane i in 0..3, with a0 = v0 + v3,
//        a1 = v1 + v2, a2 = v1 - v2 and a3 = v0 - v3:
//          out[ 0 + i] = (a0 + a1 + 7) >> 4
//          out[ 4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0)
//          out[ 8 + i] = (a0 - a1 + 7) >> 4
//          out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16
static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
                            int16_t* out) {
  const __m128i zero = _mm_setzero_si128();

  // Even outputs. The same operations run on both 64-bit halves, but only
  // the low half (lanes 0..3) is kept in the end.
  const __m128i sum = _mm_add_epi16(*v01, *v32);           // a0 | a1
  const __m128i sum_rnd = _mm_add_epi16(sum, _mm_set1_epi16(7));
  const __m128i sum_hi = _mm_unpackhi_epi64(sum, sum);     // a1 | a1
  const __m128i even0 =
      _mm_srai_epi16(_mm_add_epi16(sum_rnd, sum_hi), 4);   // (a0 + a1 + 7) >> 4
  const __m128i even2 =
      _mm_srai_epi16(_mm_sub_epi16(sum_rnd, sum_hi), 4);   // (a0 - a1 + 7) >> 4

  // Odd outputs. Interleave a2 and a3 so that _mm_madd_epi16() produces one
  // 32-bit sum per lane. _mm_set_epi16() lists the highest lane first, hence
  // a2 is paired with the second value of each couple below.
  const __m128i diff = _mm_sub_epi16(*v01, *v32);          // a3 | a2
  const __m128i diff_hi = _mm_unpackhi_epi64(diff, diff);  // a2 | a2
  const __m128i a2_a3 = _mm_unpacklo_epi16(diff_hi, diff);
  const __m128i mul_out1 = _mm_set_epi16(5352, 2217, 5352, 2217,
                                         5352, 2217, 5352, 2217);
  const __m128i mul_out3 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                         2217, -5352, 2217, -5352);
  // The extra (1 << 16) adds one to the shifted result; see 'odd1' below.
  const __m128i round_out1 = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i round_out3 = _mm_set1_epi32(51000);
  const __m128i wide1 = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(a2_a3, mul_out1), round_out1), 16);
  const __m128i wide3 = _mm_srai_epi32(
      _mm_add_epi32(_mm_madd_epi16(a2_a3, mul_out3), round_out3), 16);
  const __m128i narrow1 = _mm_packs_epi32(wide1, wide1);
  const __m128i odd3 = _mm_packs_epi32(wide3, wide3);
  // The wanted correction is '+ (a3 != 0)'. The compare yields 0xffff (-1)
  // where a3 == 0 and 0 elsewhere; combined with the one already added through
  // 'round_out1' this gives 1 - (a3 == 0), i.e. exactly (a3 != 0).
  const __m128i odd1 = _mm_add_epi16(narrow1, _mm_cmpeq_epi16(diff, zero));

  // out[0..3] | out[4..7] and out[8..11] | out[12..15].
  _mm_storeu_si128((__m128i*)&out[0], _mm_unpacklo_epi64(even0, odd1));
  _mm_storeu_si128((__m128i*)&out[8], _mm_unpacklo_epi64(even2, odd3));
}
2937c8da7ce66017295a65ec028084b90800be377f8James Zern
2947c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
2957c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
2960912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Load src.
2977c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
2987c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
2997c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
3007c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
3010912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 00 01 02 03 *
3020912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 10 11 12 13 *
3030912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 20 21 22 23 *
3040912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 30 31 32 33 *
3050912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Shuffle.
3060912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i src_0 = _mm_unpacklo_epi16(src0, src1);
3070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i src_1 = _mm_unpacklo_epi16(src2, src3);
3080912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 00 01 10 11 02 03 12 13 * * ...
3090912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // 20 21 30 31 22 22 32 33 * * ...
3100912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern
3110912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Load ref.
3127c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
3137c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
3147c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
3157c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
3160912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1);
3170912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3);
3180912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern
3190912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Convert both to 16 bit.
3200912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero);
3210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero);
3220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero);
3230912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero);
3240912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern
3250912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Compute the difference.
3260912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b);
3270912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b);
328b6dbce6bfeaabde2a7b581c4c6888d532d32f3acDerek Sollenberger  __m128i v01, v32;
3294b2196c929b70f2cdc1c2556580d349db89356d8Vikas Arora
3307c8da7ce66017295a65ec028084b90800be377f8James Zern  // First pass
3310912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  FTransformPass1(&row01, &row23, &v01, &v32);
3321e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
3337c8da7ce66017295a65ec028084b90800be377f8James Zern  // Second pass
3347c8da7ce66017295a65ec028084b90800be377f8James Zern  FTransformPass2(&v01, &v32, out);
3357c8da7ce66017295a65ec028084b90800be377f8James Zern}
3367c8da7ce66017295a65ec028084b90800be377f8James Zern
3377c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
3387c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
3397c8da7ce66017295a65ec028084b90800be377f8James Zern
3407c8da7ce66017295a65ec028084b90800be377f8James Zern  // Load src and convert to 16b.
3417c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
3427c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
3437c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
3447c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
3457c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
3467c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
3477c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
3487c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
3497c8da7ce66017295a65ec028084b90800be377f8James Zern  // Load ref and convert to 16b.
3507c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
3517c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
3527c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
3537c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
3547c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
3557c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
3567c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
3577c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
3587c8da7ce66017295a65ec028084b90800be377f8James Zern  // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
3597c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
3607c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
3617c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
3627c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
3637c8da7ce66017295a65ec028084b90800be377f8James Zern
3647c8da7ce66017295a65ec028084b90800be377f8James Zern  // Unpack and shuffle
3657c8da7ce66017295a65ec028084b90800be377f8James Zern  // 00 01 02 03   0 0 0 0
3667c8da7ce66017295a65ec028084b90800be377f8James Zern  // 10 11 12 13   0 0 0 0
3677c8da7ce66017295a65ec028084b90800be377f8James Zern  // 20 21 22 23   0 0 0 0
3687c8da7ce66017295a65ec028084b90800be377f8James Zern  // 30 31 32 33   0 0 0 0
3697c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
3707c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
3717c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
3727c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
3737c8da7ce66017295a65ec028084b90800be377f8James Zern  __m128i v01l, v32l;
3747c8da7ce66017295a65ec028084b90800be377f8James Zern  __m128i v01h, v32h;
3757c8da7ce66017295a65ec028084b90800be377f8James Zern
3767c8da7ce66017295a65ec028084b90800be377f8James Zern  // First pass
3777c8da7ce66017295a65ec028084b90800be377f8James Zern  FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
3787c8da7ce66017295a65ec028084b90800be377f8James Zern  FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
379466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
380466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Second pass
3817c8da7ce66017295a65ec028084b90800be377f8James Zern  FTransformPass2(&v01l, &v32l, out + 0);
3827c8da7ce66017295a65ec028084b90800be377f8James Zern  FTransformPass2(&v01h, &v32h, out + 16);
3837c8da7ce66017295a65ec028084b90800be377f8James Zern}
3847c8da7ce66017295a65ec028084b90800be377f8James Zern
// Horizontal stage of the Walsh-Hadamard transform for one group of four
// samples taken at in[0], in[16], in[32] and in[48].
//
// Each load fetches four int16 values, so in[0..51] must be readable, but
// only the first value of every load contributes to the result.
//
// With (16-bit saturating arithmetic):
//   a0 = in[0] + in[32],   a1 = in[16] + in[48],
//   a3 = in[0] - in[32],   a2 = in[16] - in[48],
// '*out' receives four 32-bit lanes:
//   { a0 + a1, a3 + a2, a3 - a2, a0 - a1 }
static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
  // Signs applied by the final multiply-add; lanes listed high to low.
  const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1);
  // Keep the pointee const in the casts: the input is read-only.
  const __m128i src0 = _mm_loadl_epi64((const __m128i*)&in[0 * 16]);
  const __m128i src1 = _mm_loadl_epi64((const __m128i*)&in[1 * 16]);
  const __m128i src2 = _mm_loadl_epi64((const __m128i*)&in[2 * 16]);
  const __m128i src3 = _mm_loadl_epi64((const __m128i*)&in[3 * 16]);
  const __m128i A01 = _mm_unpacklo_epi16(src0, src1);  // A0 A1 | ...
  const __m128i A23 = _mm_unpacklo_epi16(src2, src3);  // A2 A3 | ...
  const __m128i B0 = _mm_adds_epi16(A01, A23);    // a0 | a1 | ...
  const __m128i B1 = _mm_subs_epi16(A01, A23);    // a3 | a2 | ...
  const __m128i C0 = _mm_unpacklo_epi32(B0, B1);  // a0 | a1 | a3 | a2 | ...
  const __m128i C1 = _mm_unpacklo_epi32(B1, B0);  // a3 | a2 | a0 | a1 | ...
  const __m128i D = _mm_unpacklo_epi64(C0, C1);   // a0 a1 a3 a2 a3 a2 a0 a1
  // Pairwise multiply by +/-1 and add adjacent lanes -> four 32-bit sums.
  *out = _mm_madd_epi16(D, kMult);
}
400466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
// Walsh-Hadamard transform. Four horizontal passes are run through
// FTransformWHTRow() on inputs located 64 int16 apart, then combined
// vertically; the 16 results are halved (arithmetic shift) and stored in
// out[0..15].
static void FTransformWHT(const int16_t* in, int16_t* out) {
  // Input is 12b signed.
  __m128i rows[4];
  int i;
  // Rows are 14b signed.
  for (i = 0; i < 4; ++i) {
    FTransformWHTRow(in + i * 64, &rows[i]);
  }

  {
    // The a* are 15b signed.
    const __m128i sum02 = _mm_add_epi32(rows[0], rows[2]);   // a0
    const __m128i sum13 = _mm_add_epi32(rows[1], rows[3]);   // a1
    const __m128i diff13 = _mm_sub_epi32(rows[1], rows[3]);  // a2
    const __m128i diff02 = _mm_sub_epi32(rows[0], rows[2]);  // a3
    // Narrow to 16 bits (signed saturation): a0 | a3 and a1 | a2.
    const __m128i a0_a3 = _mm_packs_epi32(sum02, diff02);
    const __m128i a1_a2 = _mm_packs_epi32(sum13, diff13);

    // The b* are 16b signed.
    const __m128i b0_b1 = _mm_add_epi16(a0_a3, a1_a2);
    const __m128i b3_b2 = _mm_sub_epi16(a0_a3, a1_a2);
    // Exchange the two 64-bit halves so that b2 precedes b3.
    const __m128i b2_dup = _mm_unpackhi_epi64(b3_b2, b3_b2);
    const __m128i b2_b3 = _mm_unpacklo_epi64(b2_dup, b3_b2);

    _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0_b1, 1));
    _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2_b3, 1));
  }
}
4290406ce1417f76f2034833414dcecc9f56253640cVikas Arora
430a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------
4317c8da7ce66017295a65ec028084b90800be377f8James Zern// Compute susceptibility based on DCT-coeff histograms:
4327c8da7ce66017295a65ec028084b90800be377f8James Zern// the higher, the "easier" the macroblock is to compress.
433466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
4347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
4357c8da7ce66017295a65ec028084b90800be377f8James Zern                             int start_block, int end_block,
4367c8da7ce66017295a65ec028084b90800be377f8James Zern                             VP8Histogram* const histo) {
4371e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const __m128i zero = _mm_setzero_si128();
4387c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
4397c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
4407c8da7ce66017295a65ec028084b90800be377f8James Zern  int distribution[MAX_COEFF_THRESH + 1] = { 0 };
4417c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = start_block; j < end_block; ++j) {
4427c8da7ce66017295a65ec028084b90800be377f8James Zern    int16_t out[16];
4437c8da7ce66017295a65ec028084b90800be377f8James Zern    int k;
4447c8da7ce66017295a65ec028084b90800be377f8James Zern
4457c8da7ce66017295a65ec028084b90800be377f8James Zern    FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
4467c8da7ce66017295a65ec028084b90800be377f8James Zern
4477c8da7ce66017295a65ec028084b90800be377f8James Zern    // Convert coefficients to bin (within out[]).
4487c8da7ce66017295a65ec028084b90800be377f8James Zern    {
4497c8da7ce66017295a65ec028084b90800be377f8James Zern      // Load.
4507c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
4517c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
4527c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d0 = _mm_sub_epi16(zero, out0);
4537c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d1 = _mm_sub_epi16(zero, out1);
4547c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i abs0 = _mm_max_epi16(out0, d0);   // abs(v), 16b
4557c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i abs1 = _mm_max_epi16(out1, d1);
4567c8da7ce66017295a65ec028084b90800be377f8James Zern      // v = abs(out) >> 3
4577c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i v0 = _mm_srai_epi16(abs0, 3);
4587c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i v1 = _mm_srai_epi16(abs1, 3);
4597c8da7ce66017295a65ec028084b90800be377f8James Zern      // bin = min(v, MAX_COEFF_THRESH)
4607c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
4617c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
4627c8da7ce66017295a65ec028084b90800be377f8James Zern      // Store.
4637c8da7ce66017295a65ec028084b90800be377f8James Zern      _mm_storeu_si128((__m128i*)&out[0], bin0);
4647c8da7ce66017295a65ec028084b90800be377f8James Zern      _mm_storeu_si128((__m128i*)&out[8], bin1);
4657c8da7ce66017295a65ec028084b90800be377f8James Zern    }
4667c8da7ce66017295a65ec028084b90800be377f8James Zern
4677c8da7ce66017295a65ec028084b90800be377f8James Zern    // Convert coefficients to bin.
4687c8da7ce66017295a65ec028084b90800be377f8James Zern    for (k = 0; k < 16; ++k) {
4697c8da7ce66017295a65ec028084b90800be377f8James Zern      ++distribution[out[k]];
4701e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    }
4711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  }
4727c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8SetHistogramData(distribution, histo);
4737c8da7ce66017295a65ec028084b90800be377f8James Zern}
4747c8da7ce66017295a65ec028084b90800be377f8James Zern
4757c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
4767c8da7ce66017295a65ec028084b90800be377f8James Zern// Intra predictions
4777c8da7ce66017295a65ec028084b90800be377f8James Zern
4787c8da7ce66017295a65ec028084b90800be377f8James Zern// helper for chroma-DC predictions
4797c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
4807c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
4817c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i values = _mm_set1_epi8(v);
4827c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 8; ++j) {
4837c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
4847c8da7ce66017295a65ec028084b90800be377f8James Zern  }
4857c8da7ce66017295a65ec028084b90800be377f8James Zern}
4867c8da7ce66017295a65ec028084b90800be377f8James Zern
4877c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
4887c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
4897c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i values = _mm_set1_epi8(v);
4907c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 16; ++j) {
4917c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_store_si128((__m128i*)(dst + j * BPS), values);
4927c8da7ce66017295a65ec028084b90800be377f8James Zern  }
4937c8da7ce66017295a65ec028084b90800be377f8James Zern}
4947c8da7ce66017295a65ec028084b90800be377f8James Zern
4957c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
4967c8da7ce66017295a65ec028084b90800be377f8James Zern  if (size == 4) {
4977c8da7ce66017295a65ec028084b90800be377f8James Zern    int j;
4987c8da7ce66017295a65ec028084b90800be377f8James Zern    for (j = 0; j < 4; ++j) {
4997c8da7ce66017295a65ec028084b90800be377f8James Zern      memset(dst + j * BPS, value, 4);
5007c8da7ce66017295a65ec028084b90800be377f8James Zern    }
5017c8da7ce66017295a65ec028084b90800be377f8James Zern  } else if (size == 8) {
5027c8da7ce66017295a65ec028084b90800be377f8James Zern    Put8x8uv(value, dst);
5037c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {
5047c8da7ce66017295a65ec028084b90800be377f8James Zern    Put16(value, dst);
5057c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5067c8da7ce66017295a65ec028084b90800be377f8James Zern}
5077c8da7ce66017295a65ec028084b90800be377f8James Zern
5087c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
5097c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
5107c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
5117c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 8; ++j) {
5127c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
5137c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5147c8da7ce66017295a65ec028084b90800be377f8James Zern}
5157c8da7ce66017295a65ec028084b90800be377f8James Zern
5167c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
5177c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_values = _mm_load_si128((const __m128i*)top);
5187c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
5197c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 16; ++j) {
5207c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
5217c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5227c8da7ce66017295a65ec028084b90800be377f8James Zern}
5237c8da7ce66017295a65ec028084b90800be377f8James Zern
5247c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VerticalPred(uint8_t* dst,
5257c8da7ce66017295a65ec028084b90800be377f8James Zern                                     const uint8_t* top, int size) {
5267c8da7ce66017295a65ec028084b90800be377f8James Zern  if (top != NULL) {
5277c8da7ce66017295a65ec028084b90800be377f8James Zern    if (size == 8) {
5287c8da7ce66017295a65ec028084b90800be377f8James Zern      VE8uv(dst, top);
5297c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {
5307c8da7ce66017295a65ec028084b90800be377f8James Zern      VE16(dst, top);
5317c8da7ce66017295a65ec028084b90800be377f8James Zern    }
5327c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {
5337c8da7ce66017295a65ec028084b90800be377f8James Zern    Fill(dst, 127, size);
5347c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5357c8da7ce66017295a65ec028084b90800be377f8James Zern}
5367c8da7ce66017295a65ec028084b90800be377f8James Zern
5377c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
5387c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
5397c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 8; ++j) {
5407c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i values = _mm_set1_epi8(left[j]);
5417c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_storel_epi64((__m128i*)dst, values);
5427c8da7ce66017295a65ec028084b90800be377f8James Zern    dst += BPS;
5437c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5447c8da7ce66017295a65ec028084b90800be377f8James Zern}
5457c8da7ce66017295a65ec028084b90800be377f8James Zern
5467c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
5477c8da7ce66017295a65ec028084b90800be377f8James Zern  int j;
5487c8da7ce66017295a65ec028084b90800be377f8James Zern  for (j = 0; j < 16; ++j) {
5497c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i values = _mm_set1_epi8(left[j]);
5507c8da7ce66017295a65ec028084b90800be377f8James Zern    _mm_store_si128((__m128i*)dst, values);
5517c8da7ce66017295a65ec028084b90800be377f8James Zern    dst += BPS;
5527c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5537c8da7ce66017295a65ec028084b90800be377f8James Zern}
5547c8da7ce66017295a65ec028084b90800be377f8James Zern
5557c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HorizontalPred(uint8_t* dst,
5567c8da7ce66017295a65ec028084b90800be377f8James Zern                                       const uint8_t* left, int size) {
5577c8da7ce66017295a65ec028084b90800be377f8James Zern  if (left != NULL) {
5587c8da7ce66017295a65ec028084b90800be377f8James Zern    if (size == 8) {
5597c8da7ce66017295a65ec028084b90800be377f8James Zern      HE8uv(dst, left);
5607c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {
5617c8da7ce66017295a65ec028084b90800be377f8James Zern      HE16(dst, left);
5627c8da7ce66017295a65ec028084b90800be377f8James Zern    }
5637c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {
5647c8da7ce66017295a65ec028084b90800be377f8James Zern    Fill(dst, 129, size);
5657c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5667c8da7ce66017295a65ec028084b90800be377f8James Zern}
5677c8da7ce66017295a65ec028084b90800be377f8James Zern
5687c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
5697c8da7ce66017295a65ec028084b90800be377f8James Zern                           const uint8_t* top, int size) {
5707c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
5717c8da7ce66017295a65ec028084b90800be377f8James Zern  int y;
5727c8da7ce66017295a65ec028084b90800be377f8James Zern  if (size == 8) {
5737c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
5747c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
5757c8da7ce66017295a65ec028084b90800be377f8James Zern    for (y = 0; y < 8; ++y, dst += BPS) {
5767c8da7ce66017295a65ec028084b90800be377f8James Zern      const int val = left[y] - left[-1];
5777c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i base = _mm_set1_epi16(val);
5787c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
5797c8da7ce66017295a65ec028084b90800be377f8James Zern      _mm_storel_epi64((__m128i*)dst, out);
5807c8da7ce66017295a65ec028084b90800be377f8James Zern    }
5817c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {
5827c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i top_values = _mm_load_si128((const __m128i*)top);
5837c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
5847c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
5857c8da7ce66017295a65ec028084b90800be377f8James Zern    for (y = 0; y < 16; ++y, dst += BPS) {
5867c8da7ce66017295a65ec028084b90800be377f8James Zern      const int val = left[y] - left[-1];
5877c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i base = _mm_set1_epi16(val);
5887c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
5897c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
5907c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i out = _mm_packus_epi16(out_0, out_1);
5917c8da7ce66017295a65ec028084b90800be377f8James Zern      _mm_store_si128((__m128i*)dst, out);
5927c8da7ce66017295a65ec028084b90800be377f8James Zern    }
5937c8da7ce66017295a65ec028084b90800be377f8James Zern  }
5947c8da7ce66017295a65ec028084b90800be377f8James Zern}
5957c8da7ce66017295a65ec028084b90800be377f8James Zern
5967c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
5977c8da7ce66017295a65ec028084b90800be377f8James Zern                                   const uint8_t* top, int size) {
5987c8da7ce66017295a65ec028084b90800be377f8James Zern  if (left != NULL) {
5997c8da7ce66017295a65ec028084b90800be377f8James Zern    if (top != NULL) {
6007c8da7ce66017295a65ec028084b90800be377f8James Zern      TM(dst, left, top, size);
6017c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {
6027c8da7ce66017295a65ec028084b90800be377f8James Zern      HorizontalPred(dst, left, size);
6037c8da7ce66017295a65ec028084b90800be377f8James Zern    }
6047c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {
6057c8da7ce66017295a65ec028084b90800be377f8James Zern    // true motion without left samples (hence: with default 129 value)
6067c8da7ce66017295a65ec028084b90800be377f8James Zern    // is equivalent to VE prediction where you just copy the top samples.
6077c8da7ce66017295a65ec028084b90800be377f8James Zern    // Note that if top samples are not available, the default value is
6087c8da7ce66017295a65ec028084b90800be377f8James Zern    // then 129, and not 127 as in the VerticalPred case.
6097c8da7ce66017295a65ec028084b90800be377f8James Zern    if (top != NULL) {
6107c8da7ce66017295a65ec028084b90800be377f8James Zern      VerticalPred(dst, top, size);
6117c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {
6127c8da7ce66017295a65ec028084b90800be377f8James Zern      Fill(dst, 129, size);
6137c8da7ce66017295a65ec028084b90800be377f8James Zern    }
6147c8da7ce66017295a65ec028084b90800be377f8James Zern  }
6157c8da7ce66017295a65ec028084b90800be377f8James Zern}
6167c8da7ce66017295a65ec028084b90800be377f8James Zern
6177c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
6187c8da7ce66017295a65ec028084b90800be377f8James Zern                              const uint8_t* top) {
6197c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
6207c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
6210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const __m128i combined = _mm_unpacklo_epi64(top_values, left_values);
6220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int DC = VP8HorizontalAdd8b(&combined) + 8;
6237c8da7ce66017295a65ec028084b90800be377f8James Zern  Put8x8uv(DC >> 4, dst);
6247c8da7ce66017295a65ec028084b90800be377f8James Zern}
6257c8da7ce66017295a65ec028084b90800be377f8James Zern
6267c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
6277c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
6287c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
6297c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i sum = _mm_sad_epu8(top_values, zero);
6307c8da7ce66017295a65ec028084b90800be377f8James Zern  const int DC = _mm_cvtsi128_si32(sum) + 4;
6317c8da7ce66017295a65ec028084b90800be377f8James Zern  Put8x8uv(DC >> 3, dst);
6327c8da7ce66017295a65ec028084b90800be377f8James Zern}
6337c8da7ce66017295a65ec028084b90800be377f8James Zern
6347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
6357c8da7ce66017295a65ec028084b90800be377f8James Zern  // 'left' is contiguous so we can reuse the top summation.
6367c8da7ce66017295a65ec028084b90800be377f8James Zern  DC8uvNoLeft(dst, left);
6377c8da7ce66017295a65ec028084b90800be377f8James Zern}
6387c8da7ce66017295a65ec028084b90800be377f8James Zern
6397c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
6407c8da7ce66017295a65ec028084b90800be377f8James Zern  Put8x8uv(0x80, dst);
6417c8da7ce66017295a65ec028084b90800be377f8James Zern}
6427c8da7ce66017295a65ec028084b90800be377f8James Zern
6437c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
6447c8da7ce66017295a65ec028084b90800be377f8James Zern                                  const uint8_t* top) {
6457c8da7ce66017295a65ec028084b90800be377f8James Zern  if (top != NULL) {
6467c8da7ce66017295a65ec028084b90800be377f8James Zern    if (left != NULL) {  // top and left present
6477c8da7ce66017295a65ec028084b90800be377f8James Zern      DC8uv(dst, left, top);
6487c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {  // top, but no left
6497c8da7ce66017295a65ec028084b90800be377f8James Zern      DC8uvNoLeft(dst, top);
6507c8da7ce66017295a65ec028084b90800be377f8James Zern    }
6517c8da7ce66017295a65ec028084b90800be377f8James Zern  } else if (left != NULL) {  // left but no top
6527c8da7ce66017295a65ec028084b90800be377f8James Zern    DC8uvNoTop(dst, left);
6537c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {  // no top, no left, nothing.
6547c8da7ce66017295a65ec028084b90800be377f8James Zern    DC8uvNoTopLeft(dst);
6557c8da7ce66017295a65ec028084b90800be377f8James Zern  }
6567c8da7ce66017295a65ec028084b90800be377f8James Zern}
6577c8da7ce66017295a65ec028084b90800be377f8James Zern
6587c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
6597c8da7ce66017295a65ec028084b90800be377f8James Zern                             const uint8_t* top) {
6607c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_row = _mm_load_si128((const __m128i*)top);
6617c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i left_row = _mm_load_si128((const __m128i*)left);
6620912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int DC =
6630912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern      VP8HorizontalAdd8b(&top_row) + VP8HorizontalAdd8b(&left_row) + 16;
6647c8da7ce66017295a65ec028084b90800be377f8James Zern  Put16(DC >> 5, dst);
6657c8da7ce66017295a65ec028084b90800be377f8James Zern}
6667c8da7ce66017295a65ec028084b90800be377f8James Zern
6677c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
6687c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_row = _mm_load_si128((const __m128i*)top);
6690912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  const int DC = VP8HorizontalAdd8b(&top_row) + 8;
6707c8da7ce66017295a65ec028084b90800be377f8James Zern  Put16(DC >> 4, dst);
6717c8da7ce66017295a65ec028084b90800be377f8James Zern}
6727c8da7ce66017295a65ec028084b90800be377f8James Zern
6737c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
6747c8da7ce66017295a65ec028084b90800be377f8James Zern  // 'left' is contiguous so we can reuse the top summation.
6757c8da7ce66017295a65ec028084b90800be377f8James Zern  DC16NoLeft(dst, left);
6767c8da7ce66017295a65ec028084b90800be377f8James Zern}
6777c8da7ce66017295a65ec028084b90800be377f8James Zern
6787c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
6797c8da7ce66017295a65ec028084b90800be377f8James Zern  Put16(0x80, dst);
6807c8da7ce66017295a65ec028084b90800be377f8James Zern}
6817c8da7ce66017295a65ec028084b90800be377f8James Zern
6827c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
6837c8da7ce66017295a65ec028084b90800be377f8James Zern                                 const uint8_t* top) {
6847c8da7ce66017295a65ec028084b90800be377f8James Zern  if (top != NULL) {
6857c8da7ce66017295a65ec028084b90800be377f8James Zern    if (left != NULL) {  // top and left present
6867c8da7ce66017295a65ec028084b90800be377f8James Zern      DC16(dst, left, top);
6877c8da7ce66017295a65ec028084b90800be377f8James Zern    } else {  // top, but no left
6887c8da7ce66017295a65ec028084b90800be377f8James Zern      DC16NoLeft(dst, top);
6897c8da7ce66017295a65ec028084b90800be377f8James Zern    }
6907c8da7ce66017295a65ec028084b90800be377f8James Zern  } else if (left != NULL) {  // left but no top
6917c8da7ce66017295a65ec028084b90800be377f8James Zern    DC16NoTop(dst, left);
6927c8da7ce66017295a65ec028084b90800be377f8James Zern  } else {  // no top, no left, nothing.
6937c8da7ce66017295a65ec028084b90800be377f8James Zern    DC16NoTopLeft(dst);
6947c8da7ce66017295a65ec028084b90800be377f8James Zern  }
6957c8da7ce66017295a65ec028084b90800be377f8James Zern}
6967c8da7ce66017295a65ec028084b90800be377f8James Zern
6977c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
6987c8da7ce66017295a65ec028084b90800be377f8James Zern// 4x4 predictions
6997c8da7ce66017295a65ec028084b90800be377f8James Zern
// Element at column 'col', row 'row' of the block addressed by the local
// 'dst' pointer; rows are BPS bytes apart.
#define DST(col, row) dst[(col) + (row) * BPS]
// Rounded three-tap average with weights 1:2:1.
#define AVG3(p, q, r) (((p) + 2 * (q) + (r) + 2) >> 2)
// Rounded average of two values.
#define AVG2(p, q) (((p) + (q) + 1) >> 1)
7037c8da7ce66017295a65ec028084b90800be377f8James Zern
// We use the following 8b-arithmetic tricks:
//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
// and:
//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
7117c8da7ce66017295a65ec028084b90800be377f8James Zern
7127c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) {  // vertical
7137c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i one = _mm_set1_epi8(1);
7147c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
7157c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
7167c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
7177c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
7187c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
7197c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b = _mm_subs_epu8(a, lsb);
7207c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
7217c8da7ce66017295a65ec028084b90800be377f8James Zern  const uint32_t vals = _mm_cvtsi128_si32(avg);
7227c8da7ce66017295a65ec028084b90800be377f8James Zern  int i;
7237c8da7ce66017295a65ec028084b90800be377f8James Zern  for (i = 0; i < 4; ++i) {
7247c8da7ce66017295a65ec028084b90800be377f8James Zern    WebPUint32ToMem(dst + i * BPS, vals);
7257c8da7ce66017295a65ec028084b90800be377f8James Zern  }
7267c8da7ce66017295a65ec028084b90800be377f8James Zern}
7277c8da7ce66017295a65ec028084b90800be377f8James Zern
7287c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) {  // horizontal
7297c8da7ce66017295a65ec028084b90800be377f8James Zern  const int X = top[-1];
7307c8da7ce66017295a65ec028084b90800be377f8James Zern  const int I = top[-2];
7317c8da7ce66017295a65ec028084b90800be377f8James Zern  const int J = top[-3];
7327c8da7ce66017295a65ec028084b90800be377f8James Zern  const int K = top[-4];
7337c8da7ce66017295a65ec028084b90800be377f8James Zern  const int L = top[-5];
7347c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
7357c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
7367c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
7377c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
7387c8da7ce66017295a65ec028084b90800be377f8James Zern}
7397c8da7ce66017295a65ec028084b90800be377f8James Zern
7407c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
7417c8da7ce66017295a65ec028084b90800be377f8James Zern  uint32_t dc = 4;
7427c8da7ce66017295a65ec028084b90800be377f8James Zern  int i;
7437c8da7ce66017295a65ec028084b90800be377f8James Zern  for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
7447c8da7ce66017295a65ec028084b90800be377f8James Zern  Fill(dst, dc >> 3, 4);
7457c8da7ce66017295a65ec028084b90800be377f8James Zern}
7467c8da7ce66017295a65ec028084b90800be377f8James Zern
7477c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {  // Down-Left
7487c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i one = _mm_set1_epi8(1);
7497c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
7507c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
7517c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
7527c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
7537c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
7547c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
7557c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
7567c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
7577c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
7587c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
7597c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
7607c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
7617c8da7ce66017295a65ec028084b90800be377f8James Zern}
7627c8da7ce66017295a65ec028084b90800be377f8James Zern
7637c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VR4(uint8_t* dst,
7647c8da7ce66017295a65ec028084b90800be377f8James Zern                            const uint8_t* top) {  // Vertical-Right
7657c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i one = _mm_set1_epi8(1);
7667c8da7ce66017295a65ec028084b90800be377f8James Zern  const int I = top[-2];
7677c8da7ce66017295a65ec028084b90800be377f8James Zern  const int J = top[-3];
7687c8da7ce66017295a65ec028084b90800be377f8James Zern  const int K = top[-4];
7697c8da7ce66017295a65ec028084b90800be377f8James Zern  const int X = top[-1];
7707c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
7717c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
7727c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
7737c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
7747c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
7757c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
7767c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
7777c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
7787c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
7797c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
7807c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
7817c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
7827c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
7837c8da7ce66017295a65ec028084b90800be377f8James Zern
7847c8da7ce66017295a65ec028084b90800be377f8James Zern  // these two are hard to implement in SSE2, so we keep the C-version:
7857c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 2) = AVG3(J, I, X);
7867c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 3) = AVG3(K, J, I);
7877c8da7ce66017295a65ec028084b90800be377f8James Zern}
7887c8da7ce66017295a65ec028084b90800be377f8James Zern
7897c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void VL4(uint8_t* dst,
7907c8da7ce66017295a65ec028084b90800be377f8James Zern                            const uint8_t* top) {  // Vertical-Left
7917c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i one = _mm_set1_epi8(1);
7927c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
7937c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
7947c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
7957c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
7967c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
7977c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
7987c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
7997c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
8007c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
8017c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i abbc = _mm_or_si128(ab, bc);
8027c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
8037c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
8047c8da7ce66017295a65ec028084b90800be377f8James Zern  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
8057c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
8067c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
8077c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
8087c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
8097c8da7ce66017295a65ec028084b90800be377f8James Zern
8107c8da7ce66017295a65ec028084b90800be377f8James Zern  // these two are hard to get and irregular
8117c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 2) = (extra_out >> 0) & 0xff;
8127c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 3) = (extra_out >> 8) & 0xff;
8137c8da7ce66017295a65ec028084b90800be377f8James Zern}
8147c8da7ce66017295a65ec028084b90800be377f8James Zern
8157c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {  // Down-right
8167c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i one = _mm_set1_epi8(1);
8177c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
8187c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
8197c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
8207c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
8217c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
8227c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
8237c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
8247c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
8257c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
8267c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
8277c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
8287c8da7ce66017295a65ec028084b90800be377f8James Zern  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
8297c8da7ce66017295a65ec028084b90800be377f8James Zern}
8307c8da7ce66017295a65ec028084b90800be377f8James Zern
8317c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
8327c8da7ce66017295a65ec028084b90800be377f8James Zern  const int I = top[-2];
8337c8da7ce66017295a65ec028084b90800be377f8James Zern  const int J = top[-3];
8347c8da7ce66017295a65ec028084b90800be377f8James Zern  const int K = top[-4];
8357c8da7ce66017295a65ec028084b90800be377f8James Zern  const int L = top[-5];
8367c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 0) =             AVG2(I, J);
8377c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(2, 0) = DST(0, 1) = AVG2(J, K);
8387c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(2, 1) = DST(0, 2) = AVG2(K, L);
8397c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(1, 0) =             AVG3(I, J, K);
8407c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
8417c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
8427c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 2) = DST(2, 2) =
8437c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
8447c8da7ce66017295a65ec028084b90800be377f8James Zern}
8457c8da7ce66017295a65ec028084b90800be377f8James Zern
8467c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
8477c8da7ce66017295a65ec028084b90800be377f8James Zern  const int X = top[-1];
8487c8da7ce66017295a65ec028084b90800be377f8James Zern  const int I = top[-2];
8497c8da7ce66017295a65ec028084b90800be377f8James Zern  const int J = top[-3];
8507c8da7ce66017295a65ec028084b90800be377f8James Zern  const int K = top[-4];
8517c8da7ce66017295a65ec028084b90800be377f8James Zern  const int L = top[-5];
8527c8da7ce66017295a65ec028084b90800be377f8James Zern  const int A = top[0];
8537c8da7ce66017295a65ec028084b90800be377f8James Zern  const int B = top[1];
8547c8da7ce66017295a65ec028084b90800be377f8James Zern  const int C = top[2];
8557c8da7ce66017295a65ec028084b90800be377f8James Zern
8567c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 0) = DST(2, 1) = AVG2(I, X);
8577c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 1) = DST(2, 2) = AVG2(J, I);
8587c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 2) = DST(2, 3) = AVG2(K, J);
8597c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(0, 3)             = AVG2(L, K);
8607c8da7ce66017295a65ec028084b90800be377f8James Zern
8617c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(3, 0)             = AVG3(A, B, C);
8627c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(2, 0)             = AVG3(X, A, B);
8637c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
8647c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
8657c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
8667c8da7ce66017295a65ec028084b90800be377f8James Zern  DST(1, 3)             = AVG3(L, K, J);
8677c8da7ce66017295a65ec028084b90800be377f8James Zern}
8687c8da7ce66017295a65ec028084b90800be377f8James Zern
8697c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
8707c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
8717c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
8727c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
8737c8da7ce66017295a65ec028084b90800be377f8James Zern  int y;
8747c8da7ce66017295a65ec028084b90800be377f8James Zern  for (y = 0; y < 4; ++y, dst += BPS) {
8757c8da7ce66017295a65ec028084b90800be377f8James Zern    const int val = top[-2 - y] - top[-1];
8767c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i base = _mm_set1_epi16(val);
8777c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
8787c8da7ce66017295a65ec028084b90800be377f8James Zern    WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
8797c8da7ce66017295a65ec028084b90800be377f8James Zern  }
8807c8da7ce66017295a65ec028084b90800be377f8James Zern}
8817c8da7ce66017295a65ec028084b90800be377f8James Zern
// The 4x4 prediction helper macros are not needed past this point.
#undef AVG2
#undef AVG3
#undef DST
8857c8da7ce66017295a65ec028084b90800be377f8James Zern
8867c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
8877c8da7ce66017295a65ec028084b90800be377f8James Zern// luma 4x4 prediction
8887c8da7ce66017295a65ec028084b90800be377f8James Zern
8897c8da7ce66017295a65ec028084b90800be377f8James Zern// Left samples are top[-5 .. -2], top_left is top[-1], top are
8907c8da7ce66017295a65ec028084b90800be377f8James Zern// located at top[0..3], and top right is top[4..7]
8917c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void Intra4Preds(uint8_t* dst, const uint8_t* top) {
8927c8da7ce66017295a65ec028084b90800be377f8James Zern  DC4(I4DC4 + dst, top);
8937c8da7ce66017295a65ec028084b90800be377f8James Zern  TM4(I4TM4 + dst, top);
8947c8da7ce66017295a65ec028084b90800be377f8James Zern  VE4(I4VE4 + dst, top);
8957c8da7ce66017295a65ec028084b90800be377f8James Zern  HE4(I4HE4 + dst, top);
8967c8da7ce66017295a65ec028084b90800be377f8James Zern  RD4(I4RD4 + dst, top);
8977c8da7ce66017295a65ec028084b90800be377f8James Zern  VR4(I4VR4 + dst, top);
8987c8da7ce66017295a65ec028084b90800be377f8James Zern  LD4(I4LD4 + dst, top);
8997c8da7ce66017295a65ec028084b90800be377f8James Zern  VL4(I4VL4 + dst, top);
9007c8da7ce66017295a65ec028084b90800be377f8James Zern  HD4(I4HD4 + dst, top);
9017c8da7ce66017295a65ec028084b90800be377f8James Zern  HU4(I4HU4 + dst, top);
9027c8da7ce66017295a65ec028084b90800be377f8James Zern}
9037c8da7ce66017295a65ec028084b90800be377f8James Zern
9047c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
9057c8da7ce66017295a65ec028084b90800be377f8James Zern// Chroma 8x8 prediction (paragraph 12.2)
9067c8da7ce66017295a65ec028084b90800be377f8James Zern
9077c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
9087c8da7ce66017295a65ec028084b90800be377f8James Zern                             const uint8_t* top) {
9097c8da7ce66017295a65ec028084b90800be377f8James Zern  // U block
9107c8da7ce66017295a65ec028084b90800be377f8James Zern  DC8uvMode(C8DC8 + dst, left, top);
9117c8da7ce66017295a65ec028084b90800be377f8James Zern  VerticalPred(C8VE8 + dst, top, 8);
9127c8da7ce66017295a65ec028084b90800be377f8James Zern  HorizontalPred(C8HE8 + dst, left, 8);
9137c8da7ce66017295a65ec028084b90800be377f8James Zern  TrueMotion(C8TM8 + dst, left, top, 8);
9147c8da7ce66017295a65ec028084b90800be377f8James Zern  // V block
9157c8da7ce66017295a65ec028084b90800be377f8James Zern  dst += 8;
9167c8da7ce66017295a65ec028084b90800be377f8James Zern  if (top != NULL) top += 8;
9177c8da7ce66017295a65ec028084b90800be377f8James Zern  if (left != NULL) left += 16;
9187c8da7ce66017295a65ec028084b90800be377f8James Zern  DC8uvMode(C8DC8 + dst, left, top);
9197c8da7ce66017295a65ec028084b90800be377f8James Zern  VerticalPred(C8VE8 + dst, top, 8);
9207c8da7ce66017295a65ec028084b90800be377f8James Zern  HorizontalPred(C8HE8 + dst, left, 8);
9217c8da7ce66017295a65ec028084b90800be377f8James Zern  TrueMotion(C8TM8 + dst, left, top, 8);
9227c8da7ce66017295a65ec028084b90800be377f8James Zern}
9237c8da7ce66017295a65ec028084b90800be377f8James Zern
9247c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
9257c8da7ce66017295a65ec028084b90800be377f8James Zern// luma 16x16 prediction (paragraph 12.3)
9267c8da7ce66017295a65ec028084b90800be377f8James Zern
9277c8da7ce66017295a65ec028084b90800be377f8James Zernstatic void Intra16Preds(uint8_t* dst,
9287c8da7ce66017295a65ec028084b90800be377f8James Zern                         const uint8_t* left, const uint8_t* top) {
9297c8da7ce66017295a65ec028084b90800be377f8James Zern  DC16Mode(I16DC16 + dst, left, top);
9307c8da7ce66017295a65ec028084b90800be377f8James Zern  VerticalPred(I16VE16 + dst, top, 16);
9317c8da7ce66017295a65ec028084b90800be377f8James Zern  HorizontalPred(I16HE16 + dst, left, 16);
9327c8da7ce66017295a65ec028084b90800be377f8James Zern  TrueMotion(I16TM16 + dst, left, top, 16);
9337c8da7ce66017295a65ec028084b90800be377f8James Zern}
9347c8da7ce66017295a65ec028084b90800be377f8James Zern
9357c8da7ce66017295a65ec028084b90800be377f8James Zern//------------------------------------------------------------------------------
9367c8da7ce66017295a65ec028084b90800be377f8James Zern// Metric
9377c8da7ce66017295a65ec028084b90800be377f8James Zern
9387c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
9397c8da7ce66017295a65ec028084b90800be377f8James Zern                                              __m128i* const sum) {
9407c8da7ce66017295a65ec028084b90800be377f8James Zern  // take abs(a-b) in 8b
9417c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a_b = _mm_subs_epu8(a, b);
9427c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b_a = _mm_subs_epu8(b, a);
9437c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
9447c8da7ce66017295a65ec028084b90800be377f8James Zern  // zero-extend to 16b
9457c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
9467c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
9477c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
9487c8da7ce66017295a65ec028084b90800be377f8James Zern  // multiply with self
9497c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i sum1 = _mm_madd_epi16(C0, C0);
9507c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i sum2 = _mm_madd_epi16(C1, C1);
9517c8da7ce66017295a65ec028084b90800be377f8James Zern  *sum = _mm_add_epi32(sum1, sum2);
9527c8da7ce66017295a65ec028084b90800be377f8James Zern}
9537c8da7ce66017295a65ec028084b90800be377f8James Zern
9547c8da7ce66017295a65ec028084b90800be377f8James Zernstatic WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
9557c8da7ce66017295a65ec028084b90800be377f8James Zern                                int num_pairs) {
9567c8da7ce66017295a65ec028084b90800be377f8James Zern  __m128i sum = _mm_setzero_si128();
9577c8da7ce66017295a65ec028084b90800be377f8James Zern  int32_t tmp[4];
9587c8da7ce66017295a65ec028084b90800be377f8James Zern  int i;
9597c8da7ce66017295a65ec028084b90800be377f8James Zern
9607c8da7ce66017295a65ec028084b90800be377f8James Zern  for (i = 0; i < num_pairs; ++i) {
9617c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
9627c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
9637c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
9647c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
9657c8da7ce66017295a65ec028084b90800be377f8James Zern    __m128i sum1, sum2;
9667c8da7ce66017295a65ec028084b90800be377f8James Zern    SubtractAndAccumulate(a0, b0, &sum1);
9677c8da7ce66017295a65ec028084b90800be377f8James Zern    SubtractAndAccumulate(a1, b1, &sum2);
9687c8da7ce66017295a65ec028084b90800be377f8James Zern    sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
9697c8da7ce66017295a65ec028084b90800be377f8James Zern    a += 2 * BPS;
9707c8da7ce66017295a65ec028084b90800be377f8James Zern    b += 2 * BPS;
9711e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  }
9727c8da7ce66017295a65ec028084b90800be377f8James Zern  _mm_storeu_si128((__m128i*)tmp, sum);
9737c8da7ce66017295a65ec028084b90800be377f8James Zern  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
9741e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
9751e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// Sum of squared differences over a 16x16 block (8 pairs of rows).
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
  const int kNumRowPairs = 8;
  return SSE_16xN(a, b, kNumRowPairs);
}
9791e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
// Sum of squared differences over a 16x8 block (4 pairs of rows).
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
  const int kNumRowPairs = 4;
  return SSE_16xN(a, b, kNumRowPairs);
}
9831e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
9847c8da7ce66017295a65ec028084b90800be377f8James Zern#define LOAD_8x16b(ptr) \
9857c8da7ce66017295a65ec028084b90800be377f8James Zern  _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
9867c8da7ce66017295a65ec028084b90800be377f8James Zern
98733f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE8x8(const uint8_t* a, const uint8_t* b) {
9887c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i zero = _mm_setzero_si128();
9897c8da7ce66017295a65ec028084b90800be377f8James Zern  int num_pairs = 4;
9907c8da7ce66017295a65ec028084b90800be377f8James Zern  __m128i sum = zero;
9917c8da7ce66017295a65ec028084b90800be377f8James Zern  int32_t tmp[4];
9927c8da7ce66017295a65ec028084b90800be377f8James Zern  while (num_pairs-- > 0) {
9937c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
9947c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
9957c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
9967c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
9977c8da7ce66017295a65ec028084b90800be377f8James Zern    // subtract
9987c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i c0 = _mm_subs_epi16(a0, b0);
9997c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i c1 = _mm_subs_epi16(a1, b1);
10007c8da7ce66017295a65ec028084b90800be377f8James Zern    // multiply/accumulate with self
10017c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i d0 = _mm_madd_epi16(c0, c0);
10027c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i d1 = _mm_madd_epi16(c1, c1);
10037c8da7ce66017295a65ec028084b90800be377f8James Zern    // collect
10047c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i sum01 = _mm_add_epi32(d0, d1);
10057c8da7ce66017295a65ec028084b90800be377f8James Zern    sum = _mm_add_epi32(sum, sum01);
10067c8da7ce66017295a65ec028084b90800be377f8James Zern    a += 2 * BPS;
10077c8da7ce66017295a65ec028084b90800be377f8James Zern    b += 2 * BPS;
10087c8da7ce66017295a65ec028084b90800be377f8James Zern  }
10097c8da7ce66017295a65ec028084b90800be377f8James Zern  _mm_storeu_si128((__m128i*)tmp, sum);
10107c8da7ce66017295a65ec028084b90800be377f8James Zern  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
10111e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora}
10127c8da7ce66017295a65ec028084b90800be377f8James Zern#undef LOAD_8x16b
10131e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
101433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int SSE4x4(const uint8_t* a, const uint8_t* b) {
10151e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const __m128i zero = _mm_setzero_si128();
1016466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
10171e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  // Load values. Note that we read 8 pixels instead of 4,
10181e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  // but the a/b buffers are over-allocated to that effect.
10197c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
10207c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
10217c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
10227c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
10237c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
10247c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
10257c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
10267c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
10277c8da7ce66017295a65ec028084b90800be377f8James Zern  // Combine pair of lines.
1028466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
1029466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
1030466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
1031466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
10327c8da7ce66017295a65ec028084b90800be377f8James Zern  // Convert to 16b.
1033466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
1034466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
1035466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
1036466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
10377c8da7ce66017295a65ec028084b90800be377f8James Zern  // subtract, square and accumulate
10387c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
10397c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
10407c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i e0 = _mm_madd_epi16(d0, d0);
10417c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i e1 = _mm_madd_epi16(d1, d1);
10427c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i sum = _mm_add_epi32(e0, e1);
10431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1044466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  int32_t tmp[4];
10457c8da7ce66017295a65ec028084b90800be377f8James Zern  _mm_storeu_si128((__m128i*)tmp, sum);
1046466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
1047466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
1048466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1049a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------
1050fa39824bb690c5806358871f46940d0450973d8aJames Zern
1051fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
1052fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i mask = _mm_set1_epi16(0x00ff);
1053fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
1054fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
1055fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
1056fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
1057fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b0 = _mm_srli_epi16(a0, 8);     // hi byte
1058fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b1 = _mm_srli_epi16(a1, 8);
1059fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b2 = _mm_srli_epi16(a2, 8);
1060fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b3 = _mm_srli_epi16(a3, 8);
1061fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i c0 = _mm_and_si128(a0, mask);   // lo byte
1062fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i c1 = _mm_and_si128(a1, mask);
1063fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i c2 = _mm_and_si128(a2, mask);
1064fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i c3 = _mm_and_si128(a3, mask);
1065fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i d0 = _mm_add_epi32(b0, c0);
1066fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i d1 = _mm_add_epi32(b1, c1);
1067fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i d2 = _mm_add_epi32(b2, c2);
1068fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i d3 = _mm_add_epi32(b3, c3);
1069fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i e0 = _mm_add_epi32(d0, d1);
1070fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i e1 = _mm_add_epi32(d2, d3);
1071fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i f0 = _mm_add_epi32(e0, e1);
1072fa39824bb690c5806358871f46940d0450973d8aJames Zern  uint16_t tmp[8];
1073fa39824bb690c5806358871f46940d0450973d8aJames Zern  _mm_storeu_si128((__m128i*)tmp, f0);
1074fa39824bb690c5806358871f46940d0450973d8aJames Zern  dc[0] = tmp[0] + tmp[1];
1075fa39824bb690c5806358871f46940d0450973d8aJames Zern  dc[1] = tmp[2] + tmp[3];
1076fa39824bb690c5806358871f46940d0450973d8aJames Zern  dc[2] = tmp[4] + tmp[5];
1077fa39824bb690c5806358871f46940d0450973d8aJames Zern  dc[3] = tmp[6] + tmp[7];
1078fa39824bb690c5806358871f46940d0450973d8aJames Zern}
1079fa39824bb690c5806358871f46940d0450973d8aJames Zern
1080fa39824bb690c5806358871f46940d0450973d8aJames Zern//------------------------------------------------------------------------------
1081466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Texture distortion
1082466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora//
1083466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// We try to match the spectral content (weighted) between source and
1084466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// reconstructed samples.
1085466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1086466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Hadamard transform
10870912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern// Returns the weighted sum of the absolute value of transformed coefficients.
10880912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern// w[] contains a row-major 4 by 4 symmetric matrix.
108933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int TTransform(const uint8_t* inA, const uint8_t* inB,
109033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      const uint16_t* const w) {
1091466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  int32_t sum[4];
1092466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
1093466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  const __m128i zero = _mm_setzero_si128();
1094466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
10950912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Load and combine inputs.
1096466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
10977c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
10987c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
10997c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
11007c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
11017c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
11027c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
11037c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
11047c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
1105466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1106466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Combine inA and inB (we'll do two transforms in parallel).
11070912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
11080912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
11090912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
11100912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
11110912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    tmp_0 = _mm_unpacklo_epi8(inAB_0, zero);
11120912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    tmp_1 = _mm_unpacklo_epi8(inAB_1, zero);
11130912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    tmp_2 = _mm_unpacklo_epi8(inAB_2, zero);
11140912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    tmp_3 = _mm_unpacklo_epi8(inAB_3, zero);
11150912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // a00 a01 a02 a03   b00 b01 b02 b03
11160912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // a10 a11 a12 a13   b10 b11 b12 b13
11170912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // a20 a21 a22 a23   b20 b21 b22 b23
11180912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    // a30 a31 a32 a33   b30 b31 b32 b33
1119466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1120466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
11210912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Vertical pass first to avoid a transpose (vertical and horizontal passes
11220912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
1123466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1124466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Calculate a and b (two 4x4 at once).
11251e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
11261e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
11271e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
11281e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
11291e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora    const __m128i b0 = _mm_add_epi16(a0, a1);
1130466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b1 = _mm_add_epi16(a3, a2);
1131466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b2 = _mm_sub_epi16(a3, a2);
1132466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b3 = _mm_sub_epi16(a0, a1);
1133466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a00 a01 a02 a03   b00 b01 b02 b03
1134466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a10 a11 a12 a13   b10 b11 b12 b13
1135466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a20 a21 a22 a23   b20 b21 b22 b23
1136466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // a30 a31 a32 a33   b30 b31 b32 b33
1137466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1138466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Transpose the two 4x4.
11390912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
1140466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1141466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
11420912efc2528d03c59d45dd9bdc9ff9ec800a3fc1James Zern  // Horizontal pass and difference of weighted sums.
1143466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1144466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Load all inputs.
11457c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
11467c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
1147466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1148466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Calculate a and b (two 4x4 at once).
1149466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
1150466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
1151466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
1152466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
1153466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b0 = _mm_add_epi16(a0, a1);
1154466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b1 = _mm_add_epi16(a3, a2);
1155466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b2 = _mm_sub_epi16(a3, a2);
1156466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const __m128i b3 = _mm_sub_epi16(a0, a1);
1157466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1158466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // Separate the transforms of inA and inB.
1159466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
1160466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
1161466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
1162466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
1163466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1164466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    {
11657c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d0 = _mm_sub_epi16(zero, A_b0);
11667c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d1 = _mm_sub_epi16(zero, A_b2);
11677c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d2 = _mm_sub_epi16(zero, B_b0);
11687c8da7ce66017295a65ec028084b90800be377f8James Zern      const __m128i d3 = _mm_sub_epi16(zero, B_b2);
11697c8da7ce66017295a65ec028084b90800be377f8James Zern      A_b0 = _mm_max_epi16(A_b0, d0);   // abs(v), 16b
11707c8da7ce66017295a65ec028084b90800be377f8James Zern      A_b2 = _mm_max_epi16(A_b2, d1);
11717c8da7ce66017295a65ec028084b90800be377f8James Zern      B_b0 = _mm_max_epi16(B_b0, d2);
11727c8da7ce66017295a65ec028084b90800be377f8James Zern      B_b2 = _mm_max_epi16(B_b2, d3);
1173466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    }
1174466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1175466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // weighted sums
1176466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    A_b0 = _mm_madd_epi16(A_b0, w_0);
1177466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    A_b2 = _mm_madd_epi16(A_b2, w_8);
1178466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    B_b0 = _mm_madd_epi16(B_b0, w_0);
1179466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    B_b2 = _mm_madd_epi16(B_b2, w_8);
1180466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    A_b0 = _mm_add_epi32(A_b0, A_b2);
1181466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    B_b0 = _mm_add_epi32(B_b0, B_b2);
1182466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1183466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // difference of weighted sums
1184466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    A_b0 = _mm_sub_epi32(A_b0, B_b0);
1185466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
1186466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1187466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  return sum[0] + sum[1] + sum[2] + sum[3];
1188466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
1189466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
// Texture distortion between the 4x4 blocks 'a' and 'b': the absolute value
// of TTransform(a, b, w), scaled down by 2^5.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
                    const uint16_t* const w) {
  return abs(TTransform(a, b, w)) >> 5;
}
1195466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
119633f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int Disto16x16(const uint8_t* const a, const uint8_t* const b,
119733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                      const uint16_t* const w) {
1198466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  int D = 0;
1199466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  int x, y;
1200466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  for (y = 0; y < 16 * BPS; y += 4 * BPS) {
1201466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    for (x = 0; x < 16; x += 4) {
120233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora      D += Disto4x4(a + x + y, b + x + y, w);
1203466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    }
1204466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1205466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  return D;
1206466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
1207466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1208a2415724fb3466168b2af5b08bd94ba732c0e753Vikas Arora//------------------------------------------------------------------------------
1209466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora// Quantization
1210466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora//
1211466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
121233f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
121333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       const uint16_t* const sharpen,
121433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                                       const VP8Matrix* const mtx) {
12150406ce1417f76f2034833414dcecc9f56253640cVikas Arora  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
12161e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora  const __m128i zero = _mm_setzero_si128();
1217466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i coeff0, coeff8;
1218466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i out0, out8;
1219466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i packed_out;
1220466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1221466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // Load all inputs.
1222466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
1223466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
12247c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
12257c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
12267c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
12277c8da7ce66017295a65ec028084b90800be377f8James Zern  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
1228466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
122933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
123033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
123133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
1232466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1233466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // coeff = abs(in) = (in ^ sign) - sign
1234466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  coeff0 = _mm_xor_si128(in0, sign0);
1235466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  coeff8 = _mm_xor_si128(in8, sign8);
1236466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  coeff0 = _mm_sub_epi16(coeff0, sign0);
1237466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  coeff8 = _mm_sub_epi16(coeff8, sign8);
1238466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1239466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // coeff = abs(in) + sharpen
124033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  if (sharpen != NULL) {
12417c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
12427c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
124333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    coeff0 = _mm_add_epi16(coeff0, sharpen0);
124433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    coeff8 = _mm_add_epi16(coeff8, sharpen8);
124533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  }
1246466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
124733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  // out = (coeff * iQ + B) >> QFIX
1248466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1249466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // doing calculations with 32b precision (QFIX=17)
1250466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // out = (coeff * iQ)
125133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
125233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
125333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
125433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
1255466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
1256466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
1257466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
1258466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
1259466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // out = (coeff * iQ + B)
12607c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
12617c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
12627c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
12637c8da7ce66017295a65ec028084b90800be377f8James Zern    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
1264466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_00 = _mm_add_epi32(out_00, bias_00);
1265466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_04 = _mm_add_epi32(out_04, bias_04);
1266466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_08 = _mm_add_epi32(out_08, bias_08);
1267466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_12 = _mm_add_epi32(out_12, bias_12);
126833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora    // out = QUANTDIV(coeff, iQ, B, QFIX)
1269466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_00 = _mm_srai_epi32(out_00, QFIX);
1270466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_04 = _mm_srai_epi32(out_04, QFIX);
1271466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_08 = _mm_srai_epi32(out_08, QFIX);
1272466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out_12 = _mm_srai_epi32(out_12, QFIX);
12730406ce1417f76f2034833414dcecc9f56253640cVikas Arora
1274466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    // pack result as 16b
1275466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out0 = _mm_packs_epi32(out_00, out_04);
1276466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out8 = _mm_packs_epi32(out_08, out_12);
12770406ce1417f76f2034833414dcecc9f56253640cVikas Arora
12780406ce1417f76f2034833414dcecc9f56253640cVikas Arora    // if (coeff > 2047) coeff = 2047
12790406ce1417f76f2034833414dcecc9f56253640cVikas Arora    out0 = _mm_min_epi16(out0, max_coeff_2047);
12800406ce1417f76f2034833414dcecc9f56253640cVikas Arora    out8 = _mm_min_epi16(out8, max_coeff_2047);
1281466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1282466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1283466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // get sign back (if (sign[j]) out_n = -out_n)
1284466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  out0 = _mm_xor_si128(out0, sign0);
1285466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  out8 = _mm_xor_si128(out8, sign8);
1286466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  out0 = _mm_sub_epi16(out0, sign0);
1287466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  out8 = _mm_sub_epi16(out8, sign8);
1288466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1289466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // in = out * Q
1290466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  in0 = _mm_mullo_epi16(out0, q0);
1291466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  in8 = _mm_mullo_epi16(out8, q8);
1292466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
12938b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora  _mm_storeu_si128((__m128i*)&in[0], in0);
12948b720228d581a84fd173b6dcb2fa295b59db489aVikas Arora  _mm_storeu_si128((__m128i*)&in[8], in8);
1295466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1296466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // zigzag the output before storing it.
1297466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  //
1298466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // The zigzag pattern can almost be reproduced with a small sequence of
1299466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // shuffles. After it, we only need to swap the 7th (ending up in third
1300466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // position instead of twelfth) and 8th values.
1301466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1302466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    __m128i outZ0, outZ8;
1303466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
1304466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
1305466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
1306466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
1307466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
1308466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
1309466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    _mm_storeu_si128((__m128i*)&out[0], outZ0);
1310466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    _mm_storeu_si128((__m128i*)&out[8], outZ8);
1311466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    packed_out = _mm_packs_epi16(outZ0, outZ8);
1312466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1313466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  {
1314466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const int16_t outZ_12 = out[12];
1315466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    const int16_t outZ_3 = out[3];
1316466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out[3] = outZ_12;
1317466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora    out[12] = outZ_3;
1318466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  }
1319466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
1320466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora  // detect if all 'out' values are zeroes or not
132133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
1322466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
1323466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
132433f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlock(int16_t in[16], int16_t out[16],
132533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                         const VP8Matrix* const mtx) {
132633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
132733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
132833f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
132933f74dabbc7920a65ed435d7417987589febdc16Vikas Arorastatic int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
133033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora                            const VP8Matrix* const mtx) {
133133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  return DoQuantizeBlock(in, out, NULL, mtx);
133233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora}
133333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora
13347c8da7ce66017295a65ec028084b90800be377f8James Zernstatic int Quantize2Blocks(int16_t in[32], int16_t out[32],
13357c8da7ce66017295a65ec028084b90800be377f8James Zern                           const VP8Matrix* const mtx) {
13367c8da7ce66017295a65ec028084b90800be377f8James Zern  int nz;
13377c8da7ce66017295a65ec028084b90800be377f8James Zern  const uint16_t* const sharpen = &mtx->sharpen_[0];
13387c8da7ce66017295a65ec028084b90800be377f8James Zern  nz  = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
13397c8da7ce66017295a65ec028084b90800be377f8James Zern  nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
13407c8da7ce66017295a65ec028084b90800be377f8James Zern  return nz;
13417c8da7ce66017295a65ec028084b90800be377f8James Zern}
13421e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
13431e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora//------------------------------------------------------------------------------
13441e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora// Entry point
13451e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
1346466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Aroraextern void VP8EncDspInitSSE2(void);
13471e7bf8805bd030c19924a5306837ecd72c295751Vikas Arora
13487c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
134933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8CollectHistogram = CollectHistogram;
13507c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8EncPredLuma16 = Intra16Preds;
13517c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8EncPredChroma8 = IntraChromaPreds;
13527c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8EncPredLuma4 = Intra4Preds;
135333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8EncQuantizeBlock = QuantizeBlock;
13547c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8EncQuantize2Blocks = Quantize2Blocks;
135533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
135633f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8ITransform = ITransform;
135733f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8FTransform = FTransform;
13587c8da7ce66017295a65ec028084b90800be377f8James Zern  VP8FTransform2 = FTransform2;
135933f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8FTransformWHT = FTransformWHT;
136033f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8SSE16x16 = SSE16x16;
136133f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8SSE16x8 = SSE16x8;
136233f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8SSE8x8 = SSE8x8;
136333f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8SSE4x4 = SSE4x4;
136433f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8TDisto4x4 = Disto4x4;
136533f74dabbc7920a65ed435d7417987589febdc16Vikas Arora  VP8TDisto16x16 = Disto16x16;
1366fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8Mean16x4 = Mean16x4;
1367fa39824bb690c5806358871f46940d0450973d8aJames Zern}
1368fa39824bb690c5806358871f46940d0450973d8aJames Zern
1369fa39824bb690c5806358871f46940d0450973d8aJames Zern//------------------------------------------------------------------------------
1370fa39824bb690c5806358871f46940d0450973d8aJames Zern// SSIM / PSNR entry point (TODO(skal): move to its own file later)
1371fa39824bb690c5806358871f46940d0450973d8aJames Zern
// Returns the sum of squared differences between src1[0..len-1] and
// src2[0..len-1], accumulated modulo 2^32.
//
// Whole 16-byte groups go through SubtractAndAccumulate() (defined earlier in
// this file; it is expected to yield four 32b partial sums for one group),
// and the remaining bytes are handled by the scalar loop at the end.
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
                                   const uint8_t* src2, int len) {
  int i = 0;
  uint32_t sse2 = 0;
  if (len >= 16) {
    // Last index at which two more 16-byte groups can still be loaded.
    const int limit = len - 32;
    // Unsigned on purpose: the four lanes are reduced with wrapping
    // arithmetic, like the uint32_t result, instead of risking signed
    // overflow on very long inputs.
    uint32_t tmp[4];
    __m128i sum1;
    __m128i sum = _mm_setzero_si128();
    // The loop is software-pipelined: (a0, b0) always holds a group that has
    // been loaded but not accumulated yet.
    __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
    __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
    i += 16;
    while (i <= limit) {
      const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
      const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
      __m128i sum2;
      i += 16;
      SubtractAndAccumulate(a0, b0, &sum1);
      sum = _mm_add_epi32(sum, sum1);
      a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
      b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
      i += 16;
      SubtractAndAccumulate(a1, b1, &sum2);
      sum = _mm_add_epi32(sum, sum2);
    }
    // Flush the group that is still pending.
    SubtractAndAccumulate(a0, b0, &sum1);
    sum = _mm_add_epi32(sum, sum1);
    _mm_storeu_si128((__m128i*)tmp, sum);
    sse2 += tmp[3] + tmp[2] + tmp[1] + tmp[0];
  }

  // Scalar tail: fewer than 16 bytes when len >= 16, everything otherwise.
  for (; i < len; ++i) {
    const int32_t diff = src1[i] - src2[i];
    sse2 += (uint32_t)(diff * diff);
  }
  return sse2;
}
1409fa39824bb690c5806358871f46940d0450973d8aJames Zern
// Returns the sum of the eight 16b lanes of '*m', widened to 32b.
// The upper four lanes are first added onto the lower four with 16b
// arithmetic, so each of those four pair-sums wraps modulo 2^16; only the
// final addition of the four pair-sums is carried out in 32b.
static uint32_t HorizontalAdd16b(const __m128i* const m) {
  uint16_t tmp[8];
  const __m128i a = _mm_srli_si128(*m, 8);   // lanes 4..7 moved to 0..3
  const __m128i b = _mm_add_epi16(*m, a);    // lane[k] + lane[k + 4], k < 4
  _mm_storeu_si128((__m128i*)tmp, b);
  return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
1417fa39824bb690c5806358871f46940d0450973d8aJames Zern
// Returns the sum of the four 32b lanes of '*m', modulo 2^32.
static uint32_t HorizontalAdd32b(const __m128i* const m) {
  const __m128i a = _mm_srli_si128(*m, 8);   // lanes 2,3 moved to 0,1
  const __m128i b = _mm_add_epi32(*m, a);    // lane0 + lane2, lane1 + lane3
  // Fold the second pair-sum onto the first: lane 0 now holds the total.
  const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
  return (uint32_t)_mm_cvtsi128_si32(c);
}
1424fa39824bb690c5806358871f46940d0450973d8aJames Zern
1425fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
1426fa39824bb690c5806358871f46940d0450973d8aJames Zern
1427fa39824bb690c5806358871f46940d0450973d8aJames Zern#define ACCUMULATE_ROW(WEIGHT) do {                         \
1428fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* compute row weight (Wx * Wy) */                        \
1429fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i Wy = _mm_set1_epi16((WEIGHT));              \
1430fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i W = _mm_mullo_epi16(Wx, Wy);                \
1431fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* process 8 bytes at a time (7 bytes, actually) */       \
1432fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
1433fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
1434fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* convert to 16b and multiply by weight */               \
1435fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i a1 = _mm_unpacklo_epi8(a0, zero);           \
1436fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i b1 = _mm_unpacklo_epi8(b0, zero);           \
1437fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i wa1 = _mm_mullo_epi16(a1, W);               \
1438fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i wb1 = _mm_mullo_epi16(b1, W);               \
1439fa39824bb690c5806358871f46940d0450973d8aJames Zern  /* accumulate */                                          \
1440fa39824bb690c5806358871f46940d0450973d8aJames Zern  xm  = _mm_add_epi16(xm, wa1);                             \
1441fa39824bb690c5806358871f46940d0450973d8aJames Zern  ym  = _mm_add_epi16(ym, wb1);                             \
1442fa39824bb690c5806358871f46940d0450973d8aJames Zern  xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1));        \
1443fa39824bb690c5806358871f46940d0450973d8aJames Zern  xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1));        \
1444fa39824bb690c5806358871f46940d0450973d8aJames Zern  yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1));        \
1445fa39824bb690c5806358871f46940d0450973d8aJames Zern  src1 += stride1;                                          \
1446fa39824bb690c5806358871f46940d0450973d8aJames Zern  src2 += stride2;                                          \
1447fa39824bb690c5806358871f46940d0450973d8aJames Zern} while (0)
1448fa39824bb690c5806358871f46940d0450973d8aJames Zern
1449fa39824bb690c5806358871f46940d0450973d8aJames Zernstatic double SSIMGet_SSE2(const uint8_t* src1, int stride1,
1450fa39824bb690c5806358871f46940d0450973d8aJames Zern                           const uint8_t* src2, int stride2) {
1451fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8DistoStats stats;
1452fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i zero = _mm_setzero_si128();
1453fa39824bb690c5806358871f46940d0450973d8aJames Zern  __m128i xm = zero, ym = zero;                // 16b accums
1454fa39824bb690c5806358871f46940d0450973d8aJames Zern  __m128i xxm = zero, yym = zero, xym = zero;  // 32b accum
1455fa39824bb690c5806358871f46940d0450973d8aJames Zern  const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
1456fa39824bb690c5806358871f46940d0450973d8aJames Zern  assert(2 * VP8_SSIM_KERNEL + 1 == 7);
1457fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(1);
1458fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(2);
1459fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(3);
1460fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(4);
1461fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(3);
1462fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(2);
1463fa39824bb690c5806358871f46940d0450973d8aJames Zern  ACCUMULATE_ROW(1);
1464fa39824bb690c5806358871f46940d0450973d8aJames Zern  stats.xm  = HorizontalAdd16b(&xm);
1465fa39824bb690c5806358871f46940d0450973d8aJames Zern  stats.ym  = HorizontalAdd16b(&ym);
1466fa39824bb690c5806358871f46940d0450973d8aJames Zern  stats.xxm = HorizontalAdd32b(&xxm);
1467fa39824bb690c5806358871f46940d0450973d8aJames Zern  stats.xym = HorizontalAdd32b(&xym);
1468fa39824bb690c5806358871f46940d0450973d8aJames Zern  stats.yym = HorizontalAdd32b(&yym);
1469fa39824bb690c5806358871f46940d0450973d8aJames Zern  return VP8SSIMFromStats(&stats);
1470fa39824bb690c5806358871f46940d0450973d8aJames Zern}
1471fa39824bb690c5806358871f46940d0450973d8aJames Zern
1472fa39824bb690c5806358871f46940d0450973d8aJames Zernextern void VP8SSIMDspInitSSE2(void);
1473fa39824bb690c5806358871f46940d0450973d8aJames Zern
1474fa39824bb690c5806358871f46940d0450973d8aJames ZernWEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
1475fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8AccumulateSSE = AccumulateSSE_SSE2;
1476fa39824bb690c5806358871f46940d0450973d8aJames Zern  VP8SSIMGet = SSIMGet_SSE2;
1477466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora}
1478466727975bcc57c0c5597bcd0747a2fe4777b303Vikas Arora
14797c8da7ce66017295a65ec028084b90800be377f8James Zern#else  // !WEBP_USE_SSE2
14807c8da7ce66017295a65ec028084b90800be377f8James Zern
14817c8da7ce66017295a65ec028084b90800be377f8James ZernWEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
1482fa39824bb690c5806358871f46940d0450973d8aJames ZernWEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
14837c8da7ce66017295a65ec028084b90800be377f8James Zern
14847c8da7ce66017295a65ec028084b90800be377f8James Zern#endif  // WEBP_USE_SSE2
1485