// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of some decoding functions (idct, loop filtering).
//
// Author: somnath@google.com (Somnath Banerjee)
//         cduvivier@google.com (Christian Duvivier)

#include "./dsp.h"

#if defined(WEBP_USE_SSE2)

// The 3-coeff sparse transform in SSE2 does not appear to be faster than the
// plain-C one, so it is disabled by default. Uncomment the following to enable:
// #define USE_TRANSFORM_AC3

#include <emmintrin.h>
#include "../dec/vp8i.h"

//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)

static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k) >> 16) + x
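  //
  // Illustrative check (not part of the original code): e.g. for x = 1000,
  //   (x * K1) >> 16 = (1000 * 85627) >> 16 = 1306, and equivalently
  //   ((x * k1) >> 16) + x = 306 + 1000 = 1306.
  // _mm_mulhi_epi16 computes exactly ((x * k) >> 16) per signed 16-bit lane,
  // which is why the "+ x" term appears as a separate add below.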
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two transforms
  // in parallel). In the case of only one transform, the second half of the
  // vectors will just contain random values we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a =  _mm_add_epi16(dc, T2);
    const __m128i b =  _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'dst' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i dst0, dst1, dst2, dst3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
    } else {
      // Load four bytes/pixels per line.
      dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
      dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
      dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
      dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
    }
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform(s).
    dst0 = _mm_add_epi16(dst0, T0);
    dst1 = _mm_add_epi16(dst1, T1);
    dst2 = _mm_add_epi16(dst2, T2);
    dst3 = _mm_add_epi16(dst3, T3);
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
    } else {
      // Store four bytes/pixels per line.
      WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
      WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
      WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
      WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
    }
  }
}

#if defined(USE_TRANSFORM_AC3)
#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
}
#undef MUL
#endif   // USE_TRANSFORM_AC3

//------------------------------------------------------------------------------
// Loop Filter (Paragraph 15)

// Compute abs(p - q) = subs(p - q) OR subs(q - p)
#define MM_ABS(p, q)  _mm_or_si128(                                            \
    _mm_subs_epu8((q), (p)),                                                   \
    _mm_subs_epu8((p), (q)))
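// E.g. for bytes p = 3 and q = 250: subs(q - p) = 247 while subs(p - q)
// saturates to 0, so the OR yields |p - q| = 247 in every byte lane.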

// Shift each byte of "x" by 3 bits while preserving the sign bit.
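// SSE2 has no per-byte arithmetic shift, so each byte is moved into the high
// half of a 16-bit lane (by unpacking with zero below it), shifted by 3 + 8,
// and packed back down. E.g. the byte 0xF6 (-10) becomes the lane
// 0xF600 (-2560); -2560 >> 11 = -2, which packs back to 0xFE, i.e. -10 >> 3.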
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
  *x = _mm_packs_epi16(lo_1, hi_1);
}

#define FLIP_SIGN_BIT2(a, b) {                                                 \
  a = _mm_xor_si128(a, sign_bit);                                              \
  b = _mm_xor_si128(b, sign_bit);                                              \
}

#define FLIP_SIGN_BIT4(a, b, c, d) {                                           \
  FLIP_SIGN_BIT2(a, b);                                                        \
  FLIP_SIGN_BIT2(c, d);                                                        \
}

// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
                                  const __m128i* const p0,
                                  const __m128i* const q0,
                                  const __m128i* const q1,
                                  int hev_thresh, __m128i* const not_hev) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i t_1 = MM_ABS(*p1, *p0);
  const __m128i t_2 = MM_ABS(*q1, *q0);

  const __m128i h = _mm_set1_epi8(hev_thresh);
  const __m128i t_max = _mm_max_epu8(t_1, t_2);

  const __m128i t_max_h = _mm_subs_epu8(t_max, h);
  *not_hev = _mm_cmpeq_epi8(t_max_h, zero);  // not_hev = (max(t_1, t_2) <= h)
}

// input pixels are int8_t
static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
                                     const __m128i* const p0,
                                     const __m128i* const q0,
                                     const __m128i* const q1,
                                     __m128i* const delta) {
  // beware of addition order, for saturation!
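  // E.g. with (q0 - p0) = 50 and (p1 - q1) = -100 the exact total is 50, but
  // computing 3 * (q0 - p0) first would saturate at 127 and then give 27;
  // interleaving the adds as below yields -50, 0, 50 with no clamping.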
  const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
  const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
  const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0);  // p1 - q1 + 1 * (q0 - p0)
  const __m128i s2 = _mm_adds_epi8(q0_p0, s1);     // p1 - q1 + 2 * (q0 - p0)
  const __m128i s3 = _mm_adds_epi8(q0_p0, s2);     // p1 - q1 + 3 * (q0 - p0)
  *delta = s3;
}

// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
                                       const __m128i* const fl) {
  const __m128i k3 = _mm_set1_epi8(3);
  const __m128i k4 = _mm_set1_epi8(4);
  __m128i v3 = _mm_adds_epi8(*fl, k3);
  __m128i v4 = _mm_adds_epi8(*fl, k4);

  SignedShift8b(&v4);                  // v4 >> 3
  SignedShift8b(&v3);                  // v3 >> 3
  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
}

// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
                                      const __m128i* const a0_lo,
                                      const __m128i* const a0_hi) {
  const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
  const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
  const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  *pi = _mm_adds_epi8(*pi, delta);
  *qi = _mm_subs_epi8(*qi, delta);
  FLIP_SIGN_BIT2(*pi, *qi);
}

// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
  const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
  const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2
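  // (SSE2 has no 8-bit shift; with every byte's lsb cleared, the 16-bit shift
  // is safe, since the only bit that would cross into the byte below is that
  // cleared lsb.)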

  const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
  const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
  const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

  const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
  *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}

//------------------------------------------------------------------------------
// Edge filtering functions

// Applies filter on 2 pixels (p0 and q0)
static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  int thresh) {
  __m128i a, mask;
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  // convert p1/q1 to int8_t (for GetBaseDelta)
  const __m128i p1s = _mm_xor_si128(*p1, sign_bit);
  const __m128i q1s = _mm_xor_si128(*q1, sign_bit);

  NeedsFilter(p1, p0, q0, q1, thresh, &mask);

  FLIP_SIGN_BIT2(*p0, *q0);
  GetBaseDelta(&p1s, p0, q0, &q1s, &a);
  a = _mm_and_si128(a, mask);     // mask filter values we don't care about
  DoSimpleFilter(p0, q0, &a);
  FLIP_SIGN_BIT2(*p0, *q0);
}

// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i k64 = _mm_set1_epi8(64);
  const __m128i k3 = _mm_set1_epi8(3);
  const __m128i k4 = _mm_set1_epi8(4);
  __m128i not_hev;
  __m128i t1, t2, t3;

  // compute hev mask
  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

  t2 = _mm_adds_epi8(t1, k3);        // 3 * (q0 - p0) + hev(p1 - q1) + 3
  t3 = _mm_adds_epi8(t1, k4);        // 3 * (q0 - p0) + hev(p1 - q1) + 4
  SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
  *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
  FLIP_SIGN_BIT2(*p0, *q0);

  // this is equivalent to signed (a + 1) >> 1 calculation
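  // (adding 0x80 biases t3 into unsigned range, _mm_avg_epu8 gives the
  //  rounded (x + 1) >> 1, and subtracting 64 removes half the bias; e.g.
  //  t3 = -6: ((-6 + 128) + 1) >> 1 - 64 = 61 - 64 = -3 = (-6 + 1) >> 1)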
  t2 = _mm_add_epi8(t3, sign_bit);
  t3 = _mm_avg_epu8(t2, zero);
  t3 = _mm_sub_epi8(t3, k64);

  t3 = _mm_and_si128(not_hev, t3);   // if !hev
  *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
  *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
  FLIP_SIGN_BIT2(*p1, *q1);
}

// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
                                  __m128i* const p0, __m128i* const q0,
                                  __m128i* const q1, __m128i* const q2,
                                  const __m128i* const mask, int hev_thresh) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  __m128i a, not_hev;

  // compute hev mask
  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
  GetBaseDelta(p1, p0, q0, q1, &a);

  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    DoSimpleFilter(p0, q0, &f);
  }

  { // do strong filter on pixels with not hev
    const __m128i k9 = _mm_set1_epi16(0x0900);
    const __m128i k63 = _mm_set1_epi16(63);

    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);

    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9);    // Filter (lo) * 9
    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9);    // Filter (hi) * 9

    const __m128i a2_lo = _mm_add_epi16(f9_lo, k63);    // Filter * 9 + 63
    const __m128i a2_hi = _mm_add_epi16(f9_hi, k63);    // Filter * 9 + 63

    const __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo);  // Filter * 18 + 63
    const __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi);  // Filter * 18 + 63

    const __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63

    Update2Pixels(p2, q2, &a2_lo, &a2_hi);
    Update2Pixels(p1, q1, &a1_lo, &a1_hi);
    Update2Pixels(p0, q0, &a0_lo, &a0_hi);
  }
}

// reads 8 rows across a vertical edge.
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
                                __m128i* const p, __m128i* const q) {
  // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
  // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
  const __m128i A0 = _mm_set_epi32(
      WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
      WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
  const __m128i A1 = _mm_set_epi32(
      WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
      WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));
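  // (_mm_set_epi32 places its last argument in the lowest lane, so row 0 lands
  // in bytes 0..3 of A0, matching the byte layout sketched above.)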

  // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
  // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
  const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
  const __m128i B1 = _mm_unpackhi_epi8(A0, A1);

  // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
  // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
  const __m128i C0 = _mm_unpacklo_epi16(B0, B1);
  const __m128i C1 = _mm_unpackhi_epi16(B0, B1);

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  *p = _mm_unpacklo_epi32(C0, C1);
  *q = _mm_unpackhi_epi32(C0, C1);
}

static WEBP_INLINE void Load16x4(const uint8_t* const r0,
                                 const uint8_t* const r8,
                                 int stride,
                                 __m128i* const p1, __m128i* const p0,
                                 __m128i* const q0, __m128i* const q1) {
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
  //                 ...  |  ...
  //                e0 e1 | e2 e3
  //                f0 f1 | f2 f3
  //
  // r0 is pointing to the 0th row (00)
  // r8 is pointing to the 8th row (80)

  // Load
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

  {
    // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    const __m128i t1 = *p1;
    const __m128i t2 = *q0;
    *p1 = _mm_unpacklo_epi64(t1, *p0);
    *p0 = _mm_unpackhi_epi64(t1, *p0);
    *q0 = _mm_unpacklo_epi64(t2, *q1);
    *q1 = _mm_unpackhi_epi64(t2, *q1);
  }
}

static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
    *x = _mm_srli_si128(*x, 4);
  }
}

// Transpose back and store
static WEBP_INLINE void Store16x4(const __m128i* const p1,
                                  const __m128i* const p0,
                                  const __m128i* const q0,
                                  const __m128i* const q1,
                                  uint8_t* r0, uint8_t* r8,
                                  int stride) {
  __m128i t1, p1_s, p0_s, q0_s, q1_s;

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
  p0_s = _mm_unpacklo_epi8(*p1, t1);
  p1_s = _mm_unpackhi_epi8(*p1, t1);

  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
  q0_s = _mm_unpacklo_epi8(t1, *q1);
  q1_s = _mm_unpackhi_epi8(t1, *q1);

  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
  t1 = p0_s;
  p0_s = _mm_unpacklo_epi16(t1, q0_s);
  q0_s = _mm_unpackhi_epi16(t1, q0_s);

  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
  t1 = p1_s;
  p1_s = _mm_unpacklo_epi16(t1, q1_s);
  q1_s = _mm_unpackhi_epi16(t1, q1_s);

  Store4x4(&p0_s, r0, stride);
  r0 += 4 * stride;
  Store4x4(&q0_s, r0, stride);

  Store4x4(&p1_s, r8, stride);
  r8 += 4 * stride;
  Store4x4(&q1_s, r8, stride);
}

//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);

  DoFilter2(&p1, &p0, &q0, &q1, thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
  _mm_storeu_si128((__m128i*)&p[0], q0);
}

static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;

  p -= 2;  // beginning of p1

  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
  Store16x4(&p1, &p0, &q0, &q1, p, p + 8 * stride, stride);
}

static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16(p, stride, thresh);
  }
}

static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

#define MAX_DIFF1(p3, p2, p1, p0, m) do {                                      \
  m = MM_ABS(p1, p0);                                                          \
  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
} while (0)

#define MAX_DIFF2(p3, p2, p1, p0, m) do {                                      \
  m = _mm_max_epu8(m, MM_ABS(p1, p0));                                         \
  m = _mm_max_epu8(m, MM_ABS(p3, p2));                                         \
  m = _mm_max_epu8(m, MM_ABS(p2, p1));                                         \
} while (0)

#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) {                             \
  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]);                            \
  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]);                            \
  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]);                            \
  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]);                            \
}

#define LOADUV_H_EDGE(p, u, v, stride) do {                                    \
  const __m128i U = _mm_loadl_epi64((__m128i*)&(u)[(stride)]);                 \
  const __m128i V = _mm_loadl_epi64((__m128i*)&(v)[(stride)]);                 \
  p = _mm_unpacklo_epi64(U, V);                                                \
} while (0)

#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) {                        \
  LOADUV_H_EDGE(e1, u, v, 0 * stride);                                         \
  LOADUV_H_EDGE(e2, u, v, 1 * stride);                                         \
  LOADUV_H_EDGE(e3, u, v, 2 * stride);                                         \
  LOADUV_H_EDGE(e4, u, v, 3 * stride);                                         \
}

#define STOREUV(p, u, v, stride) {                                             \
  _mm_storel_epi64((__m128i*)&u[(stride)], p);                                 \
  p = _mm_srli_si128(p, 8);                                                    \
  _mm_storel_epi64((__m128i*)&v[(stride)], p);                                 \
}

static WEBP_INLINE void ComplexMask(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, int ithresh,
                                    __m128i* const mask) {
  const __m128i it = _mm_set1_epi8(ithresh);
  const __m128i diff = _mm_subs_epu8(*mask, it);
  const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
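  // (on entry, *mask holds the largest abs() difference between neighboring
  //  pixels, as built by MAX_DIFF1/MAX_DIFF2; the test passes where that
  //  maximum is <= ithresh)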
  __m128i filter_mask;
  NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
  *mask = _mm_and_si128(thresh_mask, filter_mask);
}

// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i t1;
  __m128i mask;
  __m128i p2, p1, p0, q0, q1, q2;

  // Load p3, p2, p1, p0
  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
  MAX_DIFF1(t1, p2, p1, p0, mask);

  // Load q0, q1, q2, q3
  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
  _mm_storeu_si128((__m128i*)&p[+0 * stride], q0);
  _mm_storeu_si128((__m128i*)&p[+1 * stride], q1);
  _mm_storeu_si128((__m128i*)&p[+2 * stride], q2);
}

static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const b = p - 4;
  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);  // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(&p3, &p2, &p1, &p0, b, b + 8 * stride, stride);
  Store16x4(&q0, &q1, &q2, &q3, p, p + 8 * stride, stride);
}

// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

  LOAD_H_EDGES4(p, stride, p3, p2, p1, p0);  // prologue

  for (k = 3; k > 0; --k) {
    __m128i mask, tmp1, tmp2;
    uint8_t* const b = p + 2 * stride;   // beginning of p1
    p += 4 * stride;

    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
    LOAD_H_EDGES4(p, stride, p3, p2, tmp1, tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

    // p3 and p2 are not just temporary variables here: they will be
    // re-used for the next span, and q2/q3 will become p1/p0 accordingly.
    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

    // Store
    _mm_storeu_si128((__m128i*)&b[0 * stride], p1);
    _mm_storeu_si128((__m128i*)&b[1 * stride], p0);
    _mm_storeu_si128((__m128i*)&b[2 * stride], p3);
    _mm_storeu_si128((__m128i*)&b[3 * stride], p2);

    // rotate samples
    p1 = tmp1;
    p0 = tmp2;
  }
}

static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i p3, p2, p1, p0;   // loop invariants

  Load16x4(p, p + 8 * stride, stride, &p3, &p2, &p1, &p0);  // prologue

  for (k = 3; k > 0; --k) {
    __m128i mask, tmp1, tmp2;
    uint8_t* const b = p + 2;   // beginning of p1

    p += 4;  // beginning of q0 (and next span)

    MAX_DIFF1(p3, p2, p1, p0, mask);   // compute partial mask
    Load16x4(p, p + 8 * stride, stride, &p3, &p2, &tmp1, &tmp2);
    MAX_DIFF2(p3, p2, tmp1, tmp2, mask);

    ComplexMask(&p1, &p0, &p3, &p2, thresh, ithresh, &mask);
    DoFilter4(&p1, &p0, &p3, &p2, &mask, hev_thresh);

    Store16x4(&p1, &p0, &p3, &p2, b, b + 8 * stride, stride);

    // rotate samples
    p1 = tmp1;
    p0 = tmp2;
  }
}

// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;

  // Load p3, p2, p1, p0
  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
  MAX_DIFF1(t1, p2, p1, p0, mask);

  // Load q0, q1, q2, q3
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  STOREUV(p2, u, v, -3 * stride);
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
  STOREUV(q2, u, v, 2 * stride);
}

static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const tu = u - 4;
  uint8_t* const tv = v - 4;
  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0);  // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(u, v, stride, &q0, &q1, &q2, &q3);    // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(&p3, &p2, &p1, &p0, tu, tv, stride);
  Store16x4(&q0, &q1, &q2, &q3, u, v, stride);
}

static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  // Load p3, p2, p1, p0
  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4 * stride;
  v += 4 * stride;

  // Load q0, q1, q2, q3
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Store
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
}

static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  Load16x4(u, v, stride, &t2, &t1, &p1, &p0);   // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4;  // beginning of q0
  v += 4;
  Load16x4(u, v, stride, &q0, &q1, &t1, &t2);  // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);

  ComplexMask(&p1, &p0, &q0, &q1, thresh, ithresh, &mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  u -= 2;  // beginning of p1
  v -= 2;
  Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
}

//------------------------------------------------------------------------------
// 4x4 predictions

#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

// We use the following 8b-arithmetic tricks:
//     (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
//   where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
// and:
//     (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
//   where: AB = (a + b + 1) >> 1,   BC = (b + c + 1) >> 1
//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
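//
// Illustrative check of the first identity with a = 5, b = 8, c = 10:
//   exact: (5 + 16 + 10 + 2) >> 2 = 8
//   AC = [(5 + 10 + 1) >> 1] - [(5^10) & 1] = 8 - 1 = 7
//   (AC + b + 1) >> 1 = (7 + 8 + 1) >> 1 = 8
// _mm_avg_epu8 supplies the rounded (x + y + 1) >> 1 building block.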

static void VE4(uint8_t* dst) {    // vertical
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
  const __m128i b = _mm_subs_epu8(a, lsb);
  const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
  const uint32_t vals = _mm_cvtsi128_si32(avg);
  int i;
  for (i = 0; i < 4; ++i) {
    WebPUint32ToMem(dst + i * BPS, vals);
  }
}

static void LD4(uint8_t* dst) {   // Down-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}

static void VR4(uint8_t* dst) {   // Vertical-Right
  const __m128i one = _mm_set1_epi8(1);
  const int I = dst[-1 + 0 * BPS];
  const int J = dst[-1 + 1 * BPS];
  const int K = dst[-1 + 2 * BPS];
  const int X = dst[-1 - BPS];
  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
  const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
  const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
  const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
  const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               abcd    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               efgh    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));

  // these two are hard to implement in SSE2, so we keep the C-version:
  DST(0, 2) = AVG3(J, I, X);
  DST(0, 3) = AVG3(K, J, I);
}

static void VL4(uint8_t* dst) {   // Vertical-Left
  const __m128i one = _mm_set1_epi8(1);
  const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
  const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
  const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
  const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
  const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
  const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
  const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
  const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
  const __m128i abbc = _mm_or_si128(ab, bc);
  const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
  const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
  const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(               avg1    ));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(               avg4    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));

  // these two are hard to get and irregular
  DST(3, 2) = (extra_out >> 0) & 0xff;
  DST(3, 3) = (extra_out >> 8) & 0xff;
}

static void RD4(uint8_t* dst) {   // Down-right
  const __m128i one = _mm_set1_epi8(1);
  const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
  const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
  const uint32_t I = dst[-1 + 0 * BPS];
  const uint32_t J = dst[-1 + 1 * BPS];
  const uint32_t K = dst[-1 + 2 * BPS];
  const uint32_t L = dst[-1 + 3 * BPS];
  const __m128i LKJI_____ =
      _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
  const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
  const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
  const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
  const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
  const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
  const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
  const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
  WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(               abcdefg    ));
  WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
  WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
  WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
}

#undef DST
#undef AVG3

//------------------------------------------------------------------------------
// Luma 16x16

static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
  const uint8_t* top = dst - BPS;
  const __m128i zero = _mm_setzero_si128();
  int y;
  if (size == 4) {
    const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
    for (y = 0; y < 4; ++y, dst += BPS) {
      const int val = dst[-1] - top[-1];
      const __m128i base = _mm_set1_epi16(val);
      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
      WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
    }
  } else if (size == 8) {
    const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
    const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
    for (y = 0; y < 8; ++y, dst += BPS) {
      const int val = dst[-1] - top[-1];
      const __m128i base = _mm_set1_epi16(val);
      const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
      _mm_storel_epi64((__m128i*)dst, out);
    }
  } else {
    const __m128i top_values = _mm_loadu_si128((const __m128i*)top);
    const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
    const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
    for (y = 0; y < 16; ++y, dst += BPS) {
      const int val = dst[-1] - top[-1];
      const __m128i base = _mm_set1_epi16(val);
      const __m128i out_0 = _mm_add_epi16(base, top_base_0);
      const __m128i out_1 = _mm_add_epi16(base, top_base_1);
      const __m128i out = _mm_packus_epi16(out_0, out_1);
      _mm_storeu_si128((__m128i*)dst, out);
    }
  }
}

static void TM4(uint8_t* dst)   { TrueMotion(dst, 4); }
static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16(uint8_t* dst)  { TrueMotion(dst, 16); }

static void VE16(uint8_t* dst) {
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  int j;
  for (j = 0; j < 16; ++j) {
    _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
  }
}

static void HE16(uint8_t* dst) {     // horizontal
  int j;
  for (j = 16; j > 0; --j) {
    const __m128i values = _mm_set1_epi8(dst[-1]);
    _mm_storeu_si128((__m128i*)dst, values);
    dst += BPS;
  }
}

static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 16; ++j) {
    _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
  }
}

static void DC16(uint8_t* dst) {    // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
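  // (_mm_sad_epu8 against zero leaves the byte-sum of each 8-byte half in the
  //  low 16 bits of each 64-bit lane; the shuffle brings the upper lane down
  //  so a single add produces the 16-pixel total in the low lane)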
  int left = 0;
  int j;
  for (j = 0; j < 16; ++j) {
    left += dst[-1 + j * BPS];
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 16;
    Put16(DC >> 5, dst);
  }
}

static void DC16NoTop(uint8_t* dst) {   // DC with top samples not available
  int DC = 8;
  int j;
  for (j = 0; j < 16; ++j) {
    DC += dst[-1 + j * BPS];
  }
  Put16(DC >> 4, dst);
}

static void DC16NoLeft(uint8_t* dst) {  // DC with left samples not available
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
  const __m128i sad8x2 = _mm_sad_epu8(top, zero);
  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
  const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
  const int DC = _mm_cvtsi128_si32(sum) + 8;
  Put16(DC >> 4, dst);
}

static void DC16NoTopLeft(uint8_t* dst) {  // DC with no top and left samples
  Put16(0x80, dst);
}

//------------------------------------------------------------------------------
// Chroma

static void VE8uv(uint8_t* dst) {    // vertical
  int j;
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  for (j = 0; j < 8; ++j) {
    _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
  }
}

static void HE8uv(uint8_t* dst) {    // horizontal
  int j;
  for (j = 0; j < 8; ++j) {
    const __m128i values = _mm_set1_epi8(dst[-1]);
    _mm_storel_epi64((__m128i*)dst, values);
    dst += BPS;
  }
}

// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
  int j;
  const __m128i values = _mm_set1_epi8(v);
  for (j = 0; j < 8; ++j) {
    _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
  }
}

static void DC8uv(uint8_t* dst) {     // DC
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
  int left = 0;
  int j;
  for (j = 0; j < 8; ++j) {
    left += dst[-1 + j * BPS];
  }
  {
    const int DC = _mm_cvtsi128_si32(sum) + left + 8;
    Put8x8uv(DC >> 4, dst);
  }
}

static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
  const __m128i zero = _mm_setzero_si128();
  const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
  const __m128i sum = _mm_sad_epu8(top, zero);
  const int DC = _mm_cvtsi128_si32(sum) + 4;
  Put8x8uv(DC >> 3, dst);
}

static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int dc0 = 4;
  int i;
  for (i = 0; i < 8; ++i) {
    dc0 += dst[-1 + i * BPS];
  }
  Put8x8uv(dc0 >> 3, dst);
}

static void DC8uvNoTopLeft(uint8_t* dst) {    // DC with nothing
  Put8x8uv(0x80, dst);
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8DspInitSSE2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
  VP8Transform = Transform;
#if defined(USE_TRANSFORM_AC3)
  VP8TransformAC3 = TransformAC3;
#endif

  VP8VFilter16 = VFilter16;
  VP8HFilter16 = HFilter16;
  VP8VFilter8 = VFilter8;
  VP8HFilter8 = HFilter8;
  VP8VFilter16i = VFilter16i;
  VP8HFilter16i = HFilter16i;
  VP8VFilter8i = VFilter8i;
  VP8HFilter8i = HFilter8i;

  VP8SimpleVFilter16 = SimpleVFilter16;
  VP8SimpleHFilter16 = SimpleHFilter16;
  VP8SimpleVFilter16i = SimpleVFilter16i;
  VP8SimpleHFilter16i = SimpleHFilter16i;

  VP8PredLuma4[1] = TM4;
  VP8PredLuma4[2] = VE4;
  VP8PredLuma4[4] = RD4;
  VP8PredLuma4[5] = VR4;
  VP8PredLuma4[6] = LD4;
  VP8PredLuma4[7] = VL4;

  VP8PredLuma16[0] = DC16;
  VP8PredLuma16[1] = TM16;
  VP8PredLuma16[2] = VE16;
  VP8PredLuma16[3] = HE16;
  VP8PredLuma16[4] = DC16NoTop;
  VP8PredLuma16[5] = DC16NoLeft;
  VP8PredLuma16[6] = DC16NoTopLeft;

  VP8PredChroma8[0] = DC8uv;
  VP8PredChroma8[1] = TM8uv;
  VP8PredChroma8[2] = VE8uv;
  VP8PredChroma8[3] = HE8uv;
  VP8PredChroma8[4] = DC8uvNoTop;
  VP8PredChroma8[5] = DC8uvNoLeft;
  VP8PredChroma8[6] = DC8uvNoTopLeft;
}

#else  // !WEBP_USE_SSE2

WEBP_DSP_INIT_STUB(VP8DspInitSSE2)

#endif  // WEBP_USE_SSE2