/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

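// Clamp each 16-bit lane of |value| to the signed range implied by the bit
// depth |bd|: [-(1 << (bd - 1)), (1 << (bd - 1)) - 1].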
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
  __m128i retval;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i t80, max, min;

  if (bd == 8) {
    t80 = _mm_set1_epi16(0x80);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
  } else if (bd == 10) {
    t80 = _mm_set1_epi16(0x200);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
  } else {  // bd == 12
    t80 = _mm_set1_epi16(0x800);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
  }

  min = _mm_subs_epi16(zero, t80);

  ubounded = _mm_cmpgt_epi16(value, max);
  lbounded = _mm_cmplt_epi16(value, min);
  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
  ubounded = _mm_and_si128(ubounded, max);
  lbounded = _mm_and_si128(lbounded, min);
  retval = _mm_or_si128(retval, ubounded);
  retval = _mm_or_si128(retval, lbounded);
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit, limit, thresh;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

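  // blimit, limit and thresh are 8-bit values; scale them up to the working
  // bit depth before comparing against 10- or 12-bit pixels.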
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
  }

  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

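  // SSE2 has no unsigned 16-bit absolute difference, so abs(a - b) is formed
  // by OR'ing the two saturating differences a - b and b - a.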
  //  highbd_filter_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

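  // filter4: filt = clamp(ps1 - qs1) & hev plus 3 * (qs0 - ps0); p0/q0 are
  // adjusted by (filt + 3) >> 3 and (filt + 4) >> 3, and p1/q1 by the rounded
  // half of Filter1 on lanes where hev is not set.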
  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3, Filter2 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                       t80);
  ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);

  // end highbd_filter4
  // loopfilter done

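  // flat selects lanes where p3..q3 stay within 1 << (bd - 8) of p0/q0 (so the
  // filter8 outputs may be used); flat2 applies the same test to p4..p7 and
  // q4..q7 for the filter16 outputs.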
  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

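  // The filter16 (flat2_*) and filter8 (flat_*) outputs below are rounded
  // averages built incrementally: each step reuses the running sum, dropping
  // the sample that leaves the window and adding the one that enters it.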
  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);

  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  //  wide flat
  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  //  highbd_filter8
  p2 = _mm_andnot_si128(flat, p2);
  //  p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  //  when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  //  when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  //  when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  p6 = _mm_andnot_si128(flat2, p6);
  //  p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  //  get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  //  q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  //  get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * p), p6);
  _mm_store_si128((__m128i *)(s + 6 * p), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  //  p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  //  get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  //  full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  //  q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  //  get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  //  full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * p), p5);
  _mm_store_si128((__m128i *)(s + 5 * p), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  //  p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  //  get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  //  q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  //  get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * p), p4);
  _mm_store_si128((__m128i *)(s + 4 * p), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  //  get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  //  get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * p), p3);
  _mm_store_si128((__m128i *)(s + 3 * p), q3);

  p2 = _mm_andnot_si128(flat2, p2);
  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  //  get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  //  full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  //  get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  //  get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  //  get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  //  get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  //  get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s - 0 * p), q0);
}

void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit,
                                            const uint8_t *_limit,
                                            const uint8_t *_thresh, int bd) {
  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
}

void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
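  // Scratch rows for the filter8 (flat) outputs; they are blended with the
  // filter4 results below, selected by the flat mask.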
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // Added before shift for rounding part of ROUND_POWER_OF_TWO

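  // filter8: each output below is ROUND_POWER_OF_TWO of a 7-tap sum over
  // p3..q3; workp_a/workp_b carry the running sum from one output to the next.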
  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s + 0 * p), q0);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);
}

void vpx_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                      t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                      t80);

  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}

void vpx_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

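// Transpose num_8x8_to_transpose 8x8 blocks of 16-bit pixels; block i is read
// from src[i] with pitch in_p and written to dst[i] with pitch out_p.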
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    p0 =
        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 =
        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 =
        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 =
        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 =
        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 =
        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 =
        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 =
        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
                                        uint16_t *out, int out_p) {
  uint16_t *src0[1];
  uint16_t *src1[1];
  uint16_t *dest0[1];
  uint16_t *dest1[1];
  src0[0] = in0;
  src1[0] = in1;
  dest0[0] = out;
  dest1[0] = out + 8;
  highbd_transpose(src0, in_p, dest0, out_p, 1);
  highbd_transpose(src1, in_p, dest1, out_p, 1);
}

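// The vertical filters below transpose the pixels into a scratch buffer, run
// the corresponding horizontal filter, and transpose the result back.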
void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
                                    bd);
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

  //  Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  //  Loop filtering
  vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, bd);

  //  Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}