/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

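// Clamps each signed 16-bit lane of |value| to the legal signed-sample range
// for the given bit depth, i.e. [-(1 << (bd - 1)), (1 << (bd - 1)) - 1],
// selecting the bound wherever a lane over- or under-shoots it.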
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
  __m128i retval;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i t80, max, min;

  if (bd == 8) {
    t80 = _mm_set1_epi16(0x80);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
  } else if (bd == 10) {
    t80 = _mm_set1_epi16(0x200);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
  } else {  // bd == 12
    t80 = _mm_set1_epi16(0x800);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
  }

  min = _mm_subs_epi16(zero, t80);

  ubounded = _mm_cmpgt_epi16(value, max);
  lbounded = _mm_cmplt_epi16(value, min);
  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
  ubounded = _mm_and_si128(ubounded, max);
  lbounded = _mm_and_si128(lbounded, min);
  retval = _mm_or_si128(retval, ubounded);
  retval = _mm_or_si128(retval, lbounded);
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit, limit, thresh;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
  }

  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

  //  highbd_filter_mask
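  //  |a - b| for unsigned 16-bit lanes: saturating-subtract in both
  //  directions and OR the results (one of the two is always zero).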
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

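  // Shift the working pixels into the signed-sample domain used by filter4
  // by subtracting t80 = 1 << (bd - 1).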
  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                       t80);
  ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);

  // end highbd_filter4
  // loopfilter done

  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each block of 16 either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);
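  // pixelFilter_p/q accumulate the 15-tap sums for the wide filter and
  // pixetFilter_p2p1p0/q2q1q0 the 7-tap sums for filter8; with the rounding
  // terms (eight / four) folded in, the scalar equivalents of the first
  // outputs are, e.g.:
  //   flat2_p0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 +
  //                                 2 * p0 + q0 + q1 + q2 + q3 + q4 + q5 +
  //                                 q6, 4)
  //   flat_p0  = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3)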

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);

  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

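  // From here on the running sums are updated as a sliding window: the
  // sample leaving the window is subtracted, and the doubled end samples
  // (sum_p7/sum_q7 for the wide filter, sum_p3/sum_q3 for filter8) supply
  // the ones entering it.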
  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  //  wide flat
  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  //  highbd_filter8
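  //  The outputs are blended in with the usual mask select:
  //  dst = (src & ~mask) | (filtered & mask), done per lane with
  //  andnot / and / or.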
  p2 = _mm_andnot_si128(flat, p2);
  //  p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  //  when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  //  when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  //  when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  p6 = _mm_andnot_si128(flat2, p6);
  //  p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  //  get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  //  q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  //  get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * p), p6);
  _mm_store_si128((__m128i *)(s + 6 * p), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  //  p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  //  get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  //  full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  //  q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  //  get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  //  full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * p), p5);
  _mm_store_si128((__m128i *)(s + 5 * p), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  //  p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  //  get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  //  q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  //  get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * p), p4);
  _mm_store_si128((__m128i *)(s + 4 * p), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  //  get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  //  get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * p), p3);
  _mm_store_si128((__m128i *)(s + 3 * p), q3);

  p2 = _mm_andnot_si128(flat2, p2);
  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  //  get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  //  full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  //  get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  //  get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  //  get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  //  get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  //  get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s - 0 * p), q0);
}

void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit,
                                            const uint8_t *_limit,
                                            const uint8_t *_thresh, int bd) {
  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
}

void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // Added before shift for rounding part of ROUND_POWER_OF_TWO

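  // The six flat-filter outputs below are the 7-tap ROUND_POWER_OF_TWO sums
  // from the C filter8, computed with sliding-window updates of workp_a and
  // workp_b; e.g. the first one is
  //   flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3)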
  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s + 0 * p), q0);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);
}

void vpx_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
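  // An arithmetic >> 3 emulated for the bit-depth-scaled range: shift
  // logically, keep the low bits via t1f, and reinsert the sign bits via
  // tffe0 on lanes that were negative.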
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                      t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                      t80);

  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}

void vpx_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

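// Transposes num_8x8_to_transpose 8x8 blocks of 16-bit pixels: block i is
// read from src[i] with stride in_p and written to dst[i] with stride out_p.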
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    p0 =
        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 =
        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 =
        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 =
        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 =
        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 =
        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 =
        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 =
        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
                                        uint16_t *out, int out_p) {
  uint16_t *src0[1];
  uint16_t *src1[1];
  uint16_t *dest0[1];
  uint16_t *dest1[1];
  src0[0] = in0;
  src1[0] = in1;
  dest0[0] = out;
  dest1[0] = out + 8;
  highbd_transpose(src0, in_p, dest0, out_p, 1);
  highbd_transpose(src1, in_p, dest1, out_p, 1);
}

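// The vertical loop filters below reuse the horizontal kernels: the pixel
// neighborhood around the vertical edge is transposed into an aligned
// scratch buffer, filtered as rows, then transposed back into place.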
void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
                                    bd);
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

  //  Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  //  Loop filtering
  vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, bd);

  //  Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}