// loopfilter_sse2.c, revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

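// Per-byte |a - b|: unsigned saturating subtraction clamps the "wrong"
// direction to zero, so OR-ing both directions yields the absolute difference.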
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

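// Wide ("_16") horizontal loop filter applied to 8 pixels per row. s points
// at the first row on the q side of the edge (q0) and p is the row stride;
// _blimit/_limit/_thresh are 16-byte aligned, byte-replicated threshold
// vectors (they are fetched with aligned loads). Each qXpX register holds the
// pX row in its low 8 bytes and the qX row in its high 8 bytes.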
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
                                       (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
                                       (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
                                       (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                       (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
                                       (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

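  // filter_mask and hev_mask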
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
                        abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

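    // Flatness masks: flat enables the 8-tap (filter8) path below, and flat2,
    // combined with flat and mask, enables the full 16-tap (filter16) path.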
    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
                                           (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                           (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                           (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
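    // A running sum over the p and q rows is built once and then updated by
    // adding/subtracting one tap per output, e.g. (mirrored for the q side):
    //   filter16: p0' = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + 2 * p0 +
    //                    q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4
    //   filter8:  p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3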
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
                                                         pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(four,
                                         _mm_add_epi16(pixetFilter_p2p1p0,
                                                       pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                             _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
                             _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

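// Slide the running filter sum by one output: add the two incoming taps
// (*a1, *a2) and subtract the two outgoing ones (*s1, *s2).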
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

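// Round the 8-tap sums down to pixels (>> 3) and select them where *flat is
// set, keeping *other_filt elsewhere.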
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
                                      _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

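// Same selection as filter8_mask, but for the 16-tap sums (>> 4); callers
// pass the wide flat2 mask as *flat.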
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
                                     _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

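// Same wide ("_16") horizontal filter as mb_lpf_horizontal_edge_w_sse2_8, but
// processing a full 16 pixels per row with unaligned 128-bit loads and stores
// instead of the packed qXpX layout.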
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

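      // First output row: p6' = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 +
      // p0 + q0 + 8) >> 4; the remaining rows update f_lo/f_hi with
      // filter_add2_sub2 instead of recomputing the sum.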
      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
                           _mm_add_epi16(p4_lo, f_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
                           _mm_add_epi16(p4_hi, f_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);

      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);

      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  }
}

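// Typical use (a sketch; the variable names here are illustrative, not from
// this file): the entry points below are normally reached through the vpx_dsp
// RTCD dispatch table with byte-replicated, 16-byte aligned thresholds, e.g.
//   vpx_lpf_horizontal_16_sse2(dst + edge_row * stride, stride,
//                              blimit_vec, limit_vec, thresh_vec, 2);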
// TODO(yunqingwang): remove count and call these 2 functions (8 or 16)
// directly.
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}

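// 8-tap ("_8") horizontal loop filter for one 8-pixel segment. The filter8
// results are staged in the aligned flat_o* scratch arrays and then blended
// with the 4-tap filter output under the flat mask; count is unused.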
void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh, int count) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

  (void)count;

  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    // filter_mask and hev_mask
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
                        abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4

    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
                        abs_diff(q3p3, q0p0));
    flat = _mm_max_epu8(abs_p1p0, flat);
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }

  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

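// Same as vpx_lpf_horizontal_8_sse2 but filters two adjacent 8-pixel segments
// at once, with each segment's blimit/limit/thresh packed into one half of a
// single 128-bit register.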
void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                    const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;

    // filter_mask and hev_mask
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

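// 4-tap ("_4") horizontal loop filter applied to two adjacent 8-pixel
// segments, again with the two sets of thresholds packed into one register.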
void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, hev, flat;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

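  // mask ends up 0xff in lanes that pass the blimit/limit smoothness tests
  // and may be filtered; hev marks lanes with high edge variance, where only
  // the innermost pixels receive the full adjustment.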
  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

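  // A scalar view of the narrow filter applied below (all arithmetic is
  // saturating, signed 8-bit; illustrative only):
  //   filt    = clamp(ps1 - qs1) & hev
  //   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask
  //   Filter1 = clamp(filt + 4) >> 3;  q0 = clamp(qs0 - Filter1)
  //   Filter2 = clamp(filt + 3) >> 3;  p0 = clamp(ps0 + Filter2)
  //   filt    = (Filter1 + 1) >> 1;    p1/q1 adjusted by filt only where !hev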
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

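    // XORing the pixels with 0x80 moves them into signed range so that the
    // saturating signed adds/subs below act as the clamps of the scalar
    // filter; the same XOR removes the bias again before storing.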
    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt = (Filter1 + 1) >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}

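// Transposes a 16x8 tile (rows 0-7 read from in0, rows 8-15 from in1, eight
// bytes per row) into an 8x16 tile at out. The numbered comments track the
// interleaved scheduling of the two source halves; unpacks are hoisted in
// between the loads.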
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);  // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);  // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);  // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
  x4 = _mm_unpacklo_epi16(x0, x1);  // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
  x5 = _mm_unpacklo_epi16(x2, x3);  // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);  // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
  x12 = _mm_unpacklo_epi16(x8, x9);  // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
  x13 = _mm_unpacklo_epi16(x10, x11);  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

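// Transposes num_8x8_to_transpose independent 8x8 byte blocks from src[] to
// dst[]. Together with transpose8x16 it lets the vertical filters below turn
// columns into rows, reuse the horizontal SSE2 kernels unchanged, and then
// transpose the filtered pixels back.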
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

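// Vertical dual 4 filter: transpose the 16-row x 8-column strip straddling
// the edge (4 pixels on each side) into a 16-byte-pitch scratch buffer, run
// the horizontal dual filter on the transposed rows, then transpose back.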
void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

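// Single-edge vertical 8 filter: same transpose / filter / transpose-back
// pattern on one 8x8 block; the count argument is ignored.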
void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh, int count) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
  unsigned char *src[1];
  unsigned char *dst[1];
  (void)count;

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  transpose(src, 8, dst, p, 1);
}

void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

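// Vertical 16 filter: the 8-row edge needs 8 pixels of support on each side,
// so the two flanking 8x8 blocks are transposed into an 8-byte-pitch scratch
// buffer and the 8-wide wide-filter kernel runs on the transposed rows.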
void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
  unsigned char *src[2];
  unsigned char *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  transpose(src, p, dst, 8, 2);

  // Loop filtering
  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  transpose(src, 8, dst, p, 2);
}

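// Same idea for a 16-row edge: two 16x8 transposes build a 16x16 scratch
// block, the 16-wide wide-filter kernel runs on its rows, and the result is
// transposed back in two halves.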
void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, thresh);

  // Transpose back
  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}