/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

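// Per-byte |a - b|: each saturating subtraction keeps only the positive
// difference (the other direction saturates to zero), so OR-ing the two
// results gives the absolute difference without widening to 16 bits.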
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

// filter_mask and hev_mask
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* abs(q1 - q0), abs(p1 - p0) */                                          \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)

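// FILTER4 applies the narrow (filter4) loop filter to the pixel pairs packed
// in p1p0/q1q0, using the hev vector and the filter mask (valid in the low
// 64 bits) left behind by FILTER_HEV_MASK, and writes the filtered pixels to
// ps1ps0 and qs1qs0.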
#define FILTER4                                                             \
  do {                                                                      \
    const __m128i t3t4 =                                                    \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
    const __m128i t80 = _mm_set1_epi8(0x80);                                \
    __m128i filter, filter2filter1, work;                                   \
                                                                            \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
                                                                            \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
    filter = _mm_and_si128(filter, mask); /* & mask */                      \
    filter = _mm_unpacklo_epi64(filter, filter);                            \
                                                                            \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
                                                                            \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
    filter = _mm_unpacklo_epi8(filter, filter);                             \
    filter = _mm_srai_epi16(filter, 9); /* round */                         \
    filter = _mm_packs_epi16(filter, filter);                               \
    filter = _mm_andnot_si128(hev, filter);                                 \
                                                                            \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
                                                                            \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
  } while (0)

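// Filters a horizontal (row) edge across 8 columns.  s points at the first
// row below the edge (q0) and p is the stride in bytes.  Each packed
// register named "ab" below holds row b in its low 8 bytes and row a in its
// high 8 bytes (e.g. q1p1 holds p1 in the low half and q1 in the high half).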
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}

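// Filters a vertical (column) edge for 8 rows: loads the 8x8 block of bytes
// spanning 4 pixels on each side of the edge, transposes it so that each
// pixel column becomes a packed row vector, reuses FILTER_HEV_MASK/FILTER4,
// then transposes back and stores the 4 changed bytes (p1 p0 q0 q1) per row.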
void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}

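// Wide (16-sample) horizontal loop filter for 8 columns.  Each qXpX register
// holds row pX in its low 8 bytes and row qX in its high 8 bytes; the basic
// filter4 result is replaced by the broader "flat" smoothing and, where
// flat2 is also set, by the widest smoothing over p7..q7.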
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

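    // flat: p1..p3 stay within 1 of p0 and q1..q3 within 1 of q0 (with the
    // filter mask also set); flat2 additionally requires p4..p7 / q4..q7 to
    // stay within 1 of p0 / q0 and gates the widest smoothing further below.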
    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

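    // _mm_shuffle_epi32(x, 68) copies the low 64 bits of x into the high
    // half as well, so the 8-byte flat/flat2 masks apply to both the p (low)
    // and q (high) halves of the qXpX registers during the blends below.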
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

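// Slides the running filter window: returns *total + *a1 + *a2 - *s1 - *s2,
// i.e. adds the two samples entering the window and subtracts the two
// leaving it.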
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

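// Shifts the 16-bit filter8 sums down by 3, packs them to bytes, and blends:
// the smoothed bytes where *flat is set, *other_filt elsewhere.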
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 =
      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

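// Same as filter8_mask but for the wide filter sums, which are shifted down
// by 4 and blended under the flat2 mask passed via *flat.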
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f =
      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

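// Wide horizontal loop filter applied to a full 16-column edge.  Unlike the
// 8-column version above, every pX/qX row stays in its own 16-byte register
// and the 16-bit filter arithmetic is split into explicit lo/hi halves.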
void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
      f_lo =
          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
      f_hi =
          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);

      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);

      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  }
}

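// 8-column horizontal loop filter with "flat" (filter8) smoothing: the
// smoothed p2..q2 rows are written to the aligned scratch arrays below and
// then blended with the filter4 output wherever the flat mask is set.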
void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    // filter_mask and hev_mask
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4

    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    flat = _mm_max_epu8(abs_p1p0, flat);
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }

  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

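// Dual 8-column filter covering 16 columns with two independent parameter
// sets: the low 8 bytes of blimit/limit/thresh come from the *0 pointers and
// the high 8 bytes from the *1 pointers, so both edges share one pass.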
1138void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
1139                                    const uint8_t *_limit0,
1140                                    const uint8_t *_thresh0,
1141                                    const uint8_t *_blimit1,
1142                                    const uint8_t *_limit1,
1143                                    const uint8_t *_thresh1) {
1144  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
1145  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
1146  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
1147  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
1148  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
1149  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
1150  const __m128i zero = _mm_set1_epi16(0);
1151  const __m128i blimit =
1152      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1153                         _mm_load_si128((const __m128i *)_blimit1));
1154  const __m128i limit =
1155      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1156                         _mm_load_si128((const __m128i *)_limit1));
1157  const __m128i thresh =
1158      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1159                         _mm_load_si128((const __m128i *)_thresh1));
1160
1161  __m128i mask, hev, flat;
1162  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1163
1164  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1165  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1166  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1167  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1168  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1169  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1170  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1171  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1172  {
1173    const __m128i abs_p1p0 =
1174        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
1175    const __m128i abs_q1q0 =
1176        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
1177    const __m128i one = _mm_set1_epi8(1);
1178    const __m128i fe = _mm_set1_epi8(0xfe);
1179    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1180    __m128i abs_p0q0 =
1181        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
1182    __m128i abs_p1q1 =
1183        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
1184    __m128i work;
1185
1186    // filter_mask and hev_mask
1187    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1188    hev = _mm_subs_epu8(flat, thresh);
1189    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1190
1191    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1192    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1193    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1194    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1195    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1196    mask = _mm_max_epu8(flat, mask);
1197    // mask |= (abs(p1 - p0) > limit) * -1;
1198    // mask |= (abs(q1 - q0) > limit) * -1;
1199    work = _mm_max_epu8(
1200        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
1201        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
1202    mask = _mm_max_epu8(work, mask);
1203    work = _mm_max_epu8(
1204        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
1205        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
1206    mask = _mm_max_epu8(work, mask);
1207    mask = _mm_subs_epu8(mask, limit);
1208    mask = _mm_cmpeq_epi8(mask, zero);
1209
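    // flat selects pixels where p1..p3 and q1..q3 all stay within 1 of p0 and
    // q0 respectively (and that already pass the filter mask), i.e. the
    // neighbourhood is smooth enough to take the wide filter outputs computed
    // below instead of the filter4 result.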
    // flat_mask4
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
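  // Flat filter: widen 8 pixels at a time to 16-bit lanes and compute the
  // rounded averages written when `flat` is set. workp_a/workp_b hold running
  // sums that are updated incrementally as the output position moves from op2
  // to oq2; results are staged in the flat_o* buffers and blended in below.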
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

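    // XOR with 0x80 maps the unsigned pixels into the signed byte domain so
    // that the saturating signed arithmetic below matches the clamping in the
    // scalar filter4 reference.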
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

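    // SSE2 has no per-byte arithmetic shift, so the signed >> 3 below is
    // emulated: do a 16-bit logical shift, keep the low 5 bits of each byte,
    // then OR 0xe0 back into bytes that were negative. Roughly, per byte
    // (sketch with a hypothetical scalar helper, not part of this file):
    //   int8_t sra3(int8_t x) {
    //     const uint8_t u = ((uint8_t)x >> 3) & 0x1f;
    //     return (int8_t)(x < 0 ? (u | 0xe0) : u);
    //   }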
    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, hev, flat;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}

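// Transposes a 16-row x 8-column block (rows 0-7 from in0, rows 8-15 from
// in1, stride in_p) into an 8-row x 16-column block at out with stride out_p.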
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

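// Transposes num_8x8_to_transpose independent 8x8 blocks; src[i] and dst[i]
// point to the top-left corner of each block, at strides in_p and out_p.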
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

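// The vertical filters below reuse the horizontal kernels: the pixels around
// the vertical edge are transposed into a small stack buffer, filtered with
// the matching horizontal function, then transposed back into the frame.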
void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
  unsigned char *src[1];
  unsigned char *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  transpose(src, 8, dst, p, 1);
}

void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
  unsigned char *src[2];
  unsigned char *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  transpose(src, 8, dst, p, 2);
}

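// Full 16x16 case: both 8-row halves are transposed into one 256-byte buffer
// so the dual 16-wide horizontal filter can handle all 16 rows of the
// vertical edge in a single call.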
void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

  // Transpose back
  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}