/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  /* SSE2 */
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"

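/* Filters 8 pixels across a horizontal edge with the wide (16-tap) loop
 * filter.  Each qXpX register holds a p row in its low 64 bits and the
 * matching q row in its high 64 bits, so the pixels above and below the
 * edge are processed together with a single set of SSE2 operations. */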
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
                                       (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
                                       (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
                                       (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                       (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
                                       (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

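  /* Build the filter mask and the high-edge-variance (hev) mask: a column
   * is filtered only while the pixel differences around the edge stay
   * within blimit/limit, and hev marks columns where |p1 - p0| or
   * |q1 - q0| exceeds thresh. */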
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                            _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                            _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                            _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                                     _mm_subs_epu8(q1p1, q2p2)),
                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                                     _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
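  /* Scalar equivalent of the 4-tap filter applied below (all values are
   * signed, i.e. the pixels with 0x80 subtracted):
   *   filt    = clamp(ps1 - qs1) & hev
   *   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask
   *   Filter1 = clamp(filt + 4) >> 3,  oq0 = qs0 - Filter1
   *   Filter2 = clamp(filt + 3) >> 3,  op0 = ps0 + Filter2
   *   filt    = ((Filter1 + 1) >> 1) & ~hev,  op1 = ps1 + filt,  oq1 = qs1 - filt
   * Packing Filter2 with -Filter1 lets a single saturating add update the
   * p and q halves of the packed registers at the same time. */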
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

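    /* Per-column filter selection: 'flat' is set when p1..p3 and q1..q3
     * all differ from p0/q0 by at most one (use the 8-tap result), and
     * 'flat2' additionally requires p4..p7 and q4..q7 to be that close
     * (use the 16-tap result).  Both are ANDed with the basic mask. */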
    {
      __m128i work;
      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                                       _mm_subs_epu8(q0p0, q2p2)),
                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                                       _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
                                           (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                           (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                                        _mm_subs_epu8(q0p0, q4p4)),
                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                                        _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                           (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                                       _mm_subs_epu8(q0p0, q6p6)),
                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                                       _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
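    /* The selected outputs computed below are rounded averages; for the
     * p side (the q side is symmetric):
     *   8-tap  (flat):  op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
     *   16-tap (flat2): op0 = (p7 + p6 + ... + p1 + 2 * p0 + q0 + ... + q6 + 8) >> 4
     * Each further tap slides this window by one sample, which is why the
     * running sums are updated with one subtract and one add per output. */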
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixelFilter_p2p1p0, pixelFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixelFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixelFilter_p2p1p0);

      pixelFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixelFilter_q2q1q0);
      pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
                                                         pixelFilter_q));
      pixelFilter_p2p1p0 = _mm_add_epi16(four,
                                         _mm_add_epi16(pixelFilter_p2p1p0,
                                                       pixelFilter_q2q1q0));
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_p2p1p0, p2_16);
      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                             _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
                             _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q1_16);
      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                             _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                             _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

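/* Same wide loop filter as above, but for a full 16-pixel edge.  With all
 * 16 lanes live there is no room to pack p and q rows together, so the
 * source rows are staged in the aligned scratch arrays ap/aq, the 8-tap
 * and 16-tap filter outputs go to flat_op/flat_oq and flat2_op/flat2_oq,
 * and the final rows are blended per column from mask, flat and flat2. */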
static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
                                             int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);

  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);

  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);

  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  int i = 0;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));

  _mm_store_si128((__m128i *)ap[4], p4);
  _mm_store_si128((__m128i *)ap[3], p3);
  _mm_store_si128((__m128i *)ap[2], p2);
  _mm_store_si128((__m128i *)ap[1], p1);
  _mm_store_si128((__m128i *)ap[0], p0);
  _mm_store_si128((__m128i *)aq[4], q4);
  _mm_store_si128((__m128i *)aq[3], q3);
  _mm_store_si128((__m128i *)aq[2], q2);
  _mm_store_si128((__m128i *)aq[1], q1);
  _mm_store_si128((__m128i *)aq[0], q0);

  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                       _mm_subs_epu8(p0, p2)),
                          _mm_or_si128(_mm_subs_epu8(q2, q0),
                                       _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                       _mm_subs_epu8(p0, p3)),
                          _mm_or_si128(_mm_subs_epu8(q3, q0),
                                       _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
                                       _mm_subs_epu8(p0, p4)),
                          _mm_or_si128(_mm_subs_epu8(q4, q0),
                                       _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
                                        _mm_subs_epu8(p0, p5)),
                           _mm_or_si128(_mm_subs_epu8(q5, q0),
                                        _mm_subs_epu8(q0, q5)));
      _mm_store_si128((__m128i *)ap[5], p5);
      _mm_store_si128((__m128i *)aq[5], q5);
      flat2 = _mm_max_epu8(work, flat2);
      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
                                       _mm_subs_epu8(p0, p6)),
                          _mm_or_si128(_mm_subs_epu8(q6, q0),
                                       _mm_subs_epu8(q0, q6)));
      _mm_store_si128((__m128i *)ap[6], p6);
      _mm_store_si128((__m128i *)aq[6], q6);
      flat2 = _mm_max_epu8(work, flat2);

      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
                                       _mm_subs_epu8(p0, p7)),
                          _mm_or_si128(_mm_subs_epu8(q7, q0),
                                       _mm_subs_epu8(q0, q7)));
      _mm_store_si128((__m128i *)ap[7], p7);
      _mm_store_si128((__m128i *)aq[7], q7);
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
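    /* The wide-filter sums need 16-bit precision, so the 16 columns are
     * handled in two passes of 8: bytes are widened to words, the running
     * sums (b for the 8-tap filter, c for the 16-tap filter) are updated
     * incrementally, and each packed result row is stored back to the
     * scratch arrays. */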
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i temp_flat2 = flat2;
      unsigned char *src = s;
      int i = 0;
      do {
        __m128i workp_shft;
        __m128i a, b, c;

        unsigned int off = i * 8;
        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);

        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));

        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);

        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));

        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q1, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q2, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q3, a);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));

        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));

        c = _mm_add_epi16(q4, c);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        b = _mm_add_epi16(q3, b);
        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                          , b));
        a = _mm_add_epi16(q5, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q6, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        a = _mm_add_epi16(q7, a);
        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
                         _mm_packus_epi16(workp_shft, workp_shft));

        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
        src += 8;
      } while (++i < 2);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    work_a = _mm_load_si128((__m128i *)ap[2]);
    p2 = _mm_load_si128((__m128i *)flat_op[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_store_si128((__m128i *)flat_op[2], p2);

    p1 = _mm_load_si128((__m128i *)flat_op[1]);
    work_a = _mm_andnot_si128(flat, ps1);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_store_si128((__m128i *)flat_op[1], p1);

    p0 = _mm_load_si128((__m128i *)flat_op[0]);
    work_a = _mm_andnot_si128(flat, ps0);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_store_si128((__m128i *)flat_op[0], p0);

    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
    work_a = _mm_andnot_si128(flat, qs0);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_store_si128((__m128i *)flat_oq[0], q0);

    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
    work_a = _mm_andnot_si128(flat, qs1);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_store_si128((__m128i *)flat_oq[1], q1);

    work_a = _mm_load_si128((__m128i *)aq[2]);
    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_store_si128((__m128i *)flat_oq[2], q2);

    // write out op6 - op3
    {
      unsigned char *dst = (s - 7 * p);
      for (i = 6; i > 2; i--) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)ap[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }

    work_a = _mm_load_si128((__m128i *)flat_op[2]);
    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p2 = _mm_and_si128(flat2, p2);
    p2 = _mm_or_si128(work_a, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    work_a = _mm_load_si128((__m128i *)flat_op[1]);
    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p1 = _mm_and_si128(flat2, p1);
    p1 = _mm_or_si128(work_a, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    work_a = _mm_load_si128((__m128i *)flat_op[0]);
    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    p0 = _mm_and_si128(flat2, p0);
    p0 = _mm_or_si128(work_a, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q0 = _mm_and_si128(flat2, q0);
    q0 = _mm_or_si128(work_a, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q1 = _mm_and_si128(flat2, q1);
    q1 = _mm_or_si128(work_a, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
    work_a = _mm_andnot_si128(flat2, work_a);
    q2 = _mm_and_si128(flat2, q2);
    q2 = _mm_or_si128(work_a, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    // write out oq3 - oq6
    {
      unsigned char *dst = (s + 3 * p);
      for (i = 3; i < 7; i++) {
        __m128i flat2_output;
        work_a = _mm_load_si128((__m128i *)aq[i]);
        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
        work_a = _mm_andnot_si128(flat2, work_a);
        flat2_output = _mm_and_si128(flat2, flat2_output);
        work_a = _mm_or_si128(work_a, flat2_output);
        _mm_storeu_si128((__m128i *)dst, work_a);
        dst += p;
      }
    }
  }
}

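/* Public entry point for the wide horizontal filter: count == 1 filters an
 * 8-pixel edge, any other value filters a 16-pixel edge.  An illustrative
 * call (the buffer and index names are placeholders, not taken from this
 * file) might be:
 *
 *   vp9_mb_lpf_horizontal_edge_w_sse2(frame_buf + row * stride, stride,
 *                                     blimit, limit, thresh, 2);
 */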
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                       int p,
                                       const unsigned char *_blimit,
                                       const unsigned char *_limit,
                                       const unsigned char *_thresh,
                                       int count) {
  if (count == 1)
    mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
  else
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
}

void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh,
                                            int count) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  __m128i mask, hev, flat;
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
  const __m128i thresh =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
  const __m128i limit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
  const __m128i blimit =
      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);

  (void)count;
  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
                                     _mm_subs_epu8(p0, p2)),
                        _mm_or_si128(_mm_subs_epu8(q2, q0),
                                     _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
                                     _mm_subs_epu8(p0, p3)),
                        _mm_or_si128(_mm_subs_epu8(q3, q0),
                                     _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                                      t80);
    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                                      t80);
    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
                                      t80);
    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
                                      t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

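/* Transposes two vertically adjacent 8x8 pixel blocks (16 rows of 8 bytes
 * starting at in0 and in1) into one 8x16 block at out, so a vertical edge
 * can be filtered with the horizontal-edge code above. */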
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

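/* Vertical-edge wrappers: transpose the columns around the edge into a
 * row-major scratch buffer, run the corresponding horizontal filter on it,
 * then transpose the filtered rows back into the frame. */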
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh,
                                          int count) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  (void)count;
  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 16;

  src[0] = s - 8;
  src[1] = s - 8 + 8;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh, 1);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;

  transpose(src, 16, dst, p, 2);
}