loopfilter_sse2.c revision 68e1c830ade592be74773e249bf94e2bbfb50de7
1/*
2 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include <emmintrin.h>  // SSE2
12
13#include "./vpx_dsp_rtcd.h"
14#include "vpx_ports/mem.h"
15#include "vpx_ports/emmintrin_compat.h"
16
17static INLINE __m128i abs_diff(__m128i a, __m128i b) {
18  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
19}
20
21// filter_mask and hev_mask
22#define FILTER_HEV_MASK do {                                                   \
23  /* (abs(q1 - q0), abs(p1 - p0) */                                            \
24  __m128i flat = abs_diff(q1p1, q0p0);                                         \
25  /* abs(p1 - q1), abs(p0 - q0) */                                             \
26  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                           \
27  __m128i abs_p0q0, abs_p1q1, work;                                            \
28                                                                               \
29  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */          \
30  hev = _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero);  \
31  hev = _mm_cmpgt_epi16(hev, thresh);                                          \
32  hev = _mm_packs_epi16(hev, hev);                                             \
33                                                                               \
34  /* const int8_t mask = filter_mask(*limit, *blimit, */                       \
35  /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */       \
36  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p0 - q0) * 2 */\
37  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0);  /* abs(p1 - q1) */\
38  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                      \
39  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1);  /* abs(p1 - q1) / 2 */      \
40  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                    \
41  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                    \
42  /* abs(p3 - p2), abs(p2 - p1) */                                             \
43  work = abs_diff(p3p2, p2p1);                                                 \
44  flat = _mm_max_epu8(work, flat);                                             \
45  /* abs(q3 - q2), abs(q2 - q1) */                                             \
46  work = abs_diff(q3q2, q2q1);                                                 \
47  flat = _mm_max_epu8(work, flat);                                             \
48  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                          \
49  mask = _mm_unpacklo_epi64(mask, flat);                                       \
50  mask = _mm_subs_epu8(mask, limit);                                           \
51  mask = _mm_cmpeq_epi8(mask, zero);                                           \
52  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                         \
53} while (0)
54
55#define FILTER4 do {                                                           \
56  const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3,                    \
57                                    4, 4, 4, 4, 4, 4, 4, 4);                   \
58  const __m128i t80 = _mm_set1_epi8(0x80);                                     \
59  __m128i filter, filter2filter1, work;                                        \
60                                                                               \
61  ps1ps0 = _mm_xor_si128(p1p0, t80);  /* ^ 0x80 */                             \
62  qs1qs0 = _mm_xor_si128(q1q0, t80);                                           \
63                                                                               \
64  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                    \
65  work = _mm_subs_epi8(ps1ps0, qs1qs0);                                        \
66  filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                        \
67  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */           \
68  filter = _mm_subs_epi8(filter, work);                                        \
69  filter = _mm_subs_epi8(filter, work);                                        \
70  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */               \
71  filter = _mm_and_si128(filter, mask);  /* & mask */                          \
72  filter = _mm_unpacklo_epi64(filter, filter);                                 \
73                                                                               \
74  /* filter1 = signed_char_clamp(filter + 4) >> 3; */                          \
75  /* filter2 = signed_char_clamp(filter + 3) >> 3; */                          \
76  filter2filter1 = _mm_adds_epi8(filter, t3t4);  /* signed_char_clamp */       \
77  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);                  \
78  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);          \
79  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);  /* >> 3 */             \
80  filter = _mm_srai_epi16(filter, 11);  /* >> 3 */                             \
81  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                    \
82                                                                               \
83  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                        \
84  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */                       \
85  filter = _mm_unpacklo_epi8(filter, filter);                                  \
86  filter = _mm_srai_epi16(filter, 9);  /* round */                             \
87  filter = _mm_packs_epi16(filter, filter);                                    \
88  filter = _mm_andnot_si128(hev, filter);                                      \
89                                                                               \
90  hev = _mm_unpackhi_epi64(filter2filter1, filter);                            \
91  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);                 \
92                                                                               \
93  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */      \
94  qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                              \
95  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */      \
96  ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                         \
97  qs1qs0 = _mm_xor_si128(qs1qs0, t80);  /* ^ 0x80 */                           \
98  ps1ps0 = _mm_xor_si128(ps1ps0, t80);  /* ^ 0x80 */                           \
99} while (0)
100
101void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
102                               const uint8_t *_blimit, const uint8_t *_limit,
103                               const uint8_t *_thresh) {
104  const __m128i zero = _mm_set1_epi16(0);
105  const __m128i limit =
106      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
107                         _mm_loadl_epi64((const __m128i *)_limit));
108  const __m128i thresh =
109      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
110  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
111  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
112  __m128i mask, hev;
113
114  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
115                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
116  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
117                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
118  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
119                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
120  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
121                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
122  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
123  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
124  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
125  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
126
127  FILTER_HEV_MASK;
128  FILTER4;
129
130  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
131  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);  // *op0
132  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);  // *oq0
133  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
134}
135
136void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
137                             const uint8_t *_blimit, const uint8_t *_limit,
138                             const uint8_t *_thresh) {
139  const __m128i zero = _mm_set1_epi16(0);
140  const __m128i limit =
141      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
142                         _mm_loadl_epi64((const __m128i *)_limit));
143  const __m128i thresh =
144      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
145  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
146  __m128i x0, x1, x2, x3;
147  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
148  __m128i mask, hev;
149
150  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
151  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
152                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
153
154  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
155  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
156                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
157
158  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
159  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
160                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
161
162  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
163  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
164                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
165
166  // Transpose 8x8
167  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
168  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
169  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
170  x0 = _mm_unpacklo_epi16(x2, x3);
171  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
172  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
173  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
174  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
175  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
176  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
177
178  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
179  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
180  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
181  x2 = _mm_unpackhi_epi16(x2, x3);
182  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
183  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
184  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
185  q1q0 = _mm_unpacklo_epi32(q1q0, x2);
186
187  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
188  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
189  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
190  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
191  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
192
193  FILTER_HEV_MASK;
194  FILTER4;
195
196  // Transpose 8x4 to 4x8
197  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
198  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
199  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
200  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
201  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
202  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
203  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
204  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
205  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
206  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
207  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
208  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
209
210  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
211  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
212  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
213  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
214  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
215  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
216  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
217
218  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
219  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
220  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
221  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
222  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
223  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
224  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
225}
226
227void vpx_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
228                                    const unsigned char *_blimit,
229                                    const unsigned char *_limit,
230                                    const unsigned char *_thresh) {
231  const __m128i zero = _mm_set1_epi16(0);
232  const __m128i one = _mm_set1_epi8(1);
233  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
234  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
235  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
236  __m128i mask, hev, flat, flat2;
237  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
238  __m128i abs_p1p0;
239
240  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
241  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
242                                       (__m64 *)(s + 4 * p)));
243  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
244  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
245                                       (__m64 *)(s + 3 * p)));
246  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
247  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
248                                       (__m64 *)(s + 2 * p)));
249  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
250  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
251                                       (__m64 *)(s + 1 * p)));
252  p1q1 = _mm_shuffle_epi32(q1p1, 78);
253  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
254  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
255                                       (__m64 *)(s - 0 * p)));
256  p0q0 = _mm_shuffle_epi32(q0p0, 78);
257
258  {
259    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
260    abs_p1p0 = abs_diff(q1p1, q0p0);
261    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
262    fe = _mm_set1_epi8(0xfe);
263    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
264    abs_p0q0 = abs_diff(q0p0, p0q0);
265    abs_p1q1 = abs_diff(q1p1, p1q1);
266    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
267    hev = _mm_subs_epu8(flat, thresh);
268    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
269
270    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
271    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
272    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
273    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
274    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
275    mask = _mm_max_epu8(abs_p1p0, mask);
276    // mask |= (abs(p1 - p0) > limit) * -1;
277    // mask |= (abs(q1 - q0) > limit) * -1;
278
279    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
280                        abs_diff(q3p3, q2p2));
281    mask = _mm_max_epu8(work, mask);
282    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
283    mask = _mm_subs_epu8(mask, limit);
284    mask = _mm_cmpeq_epi8(mask, zero);
285  }
286
287  // lp filter
288  {
289    const __m128i t4 = _mm_set1_epi8(4);
290    const __m128i t3 = _mm_set1_epi8(3);
291    const __m128i t80 = _mm_set1_epi8(0x80);
292    const __m128i t1 = _mm_set1_epi16(0x1);
293    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
294    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
295    __m128i qs0 = _mm_xor_si128(p0q0, t80);
296    __m128i qs1 = _mm_xor_si128(p1q1, t80);
297    __m128i filt;
298    __m128i work_a;
299    __m128i filter1, filter2;
300    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
301    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
302
303    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
304    work_a = _mm_subs_epi8(qs0, qs0ps0);
305    filt = _mm_adds_epi8(filt, work_a);
306    filt = _mm_adds_epi8(filt, work_a);
307    filt = _mm_adds_epi8(filt, work_a);
308    // (vpx_filter + 3 * (qs0 - ps0)) & mask
309    filt = _mm_and_si128(filt, mask);
310
311    filter1 = _mm_adds_epi8(filt, t4);
312    filter2 = _mm_adds_epi8(filt, t3);
313
314    filter1 = _mm_unpacklo_epi8(zero, filter1);
315    filter1 = _mm_srai_epi16(filter1, 0xB);
316    filter2 = _mm_unpacklo_epi8(zero, filter2);
317    filter2 = _mm_srai_epi16(filter2, 0xB);
318
319    // Filter1 >> 3
320    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
321    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
322
323    // filt >> 1
324    filt = _mm_adds_epi16(filter1, t1);
325    filt = _mm_srai_epi16(filt, 1);
326    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
327                            filt);
328    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
329    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
330    // loopfilter done
331
332    {
333      __m128i work;
334      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
335      flat = _mm_max_epu8(abs_p1p0, flat);
336      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
337      flat = _mm_subs_epu8(flat, one);
338      flat = _mm_cmpeq_epi8(flat, zero);
339      flat = _mm_and_si128(flat, mask);
340
341      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
342      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
343                                           (__m64 *)(s + 5 * p)));
344
345      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
346      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
347                                           (__m64 *)(s + 6 * p)));
348      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
349
350      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
351      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
352                                           (__m64 *)(s + 7 * p)));
353      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
354      flat2 = _mm_max_epu8(work, flat2);
355      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
356      flat2 = _mm_subs_epu8(flat2, one);
357      flat2 = _mm_cmpeq_epi8(flat2, zero);
358      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
359    }
360
361    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
362    // flat and wide flat calculations
363    {
364      const __m128i eight = _mm_set1_epi16(8);
365      const __m128i four = _mm_set1_epi16(4);
366      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
367      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
368      __m128i pixelFilter_p, pixelFilter_q;
369      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
370      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
371
372      p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
373      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
374      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
375      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
376      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
377      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
378      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
379      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
380      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
381      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
382      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
383      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
384      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
385      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
386      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
387      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
388
389      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
390                                    _mm_add_epi16(p4_16, p3_16));
391      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
392                                    _mm_add_epi16(q4_16, q3_16));
393
394      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
395      pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
396
397      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
398      pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
399      pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
400                                                         pixelFilter_q));
401      pixetFilter_p2p1p0 =   _mm_add_epi16(four,
402                                           _mm_add_epi16(pixetFilter_p2p1p0,
403                                                         pixetFilter_q2q1q0));
404      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
405                                           _mm_add_epi16(p7_16, p0_16)), 4);
406      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
407                                           _mm_add_epi16(q7_16, q0_16)), 4);
408      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
409      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
410                                           _mm_add_epi16(p3_16, p0_16)), 3);
411      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
412                                           _mm_add_epi16(q3_16, q0_16)), 3);
413
414      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
415
416      sum_p7 = _mm_add_epi16(p7_16, p7_16);
417      sum_q7 = _mm_add_epi16(q7_16, q7_16);
418      sum_p3 = _mm_add_epi16(p3_16, p3_16);
419      sum_q3 = _mm_add_epi16(q3_16, q3_16);
420
421      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
422      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
423      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
424                             _mm_add_epi16(sum_p7, p1_16)), 4);
425      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
426                             _mm_add_epi16(sum_q7, q1_16)), 4);
427      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
428
429      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
430      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
431      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
432                             _mm_add_epi16(sum_p3, p1_16)), 3);
433      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
434                             _mm_add_epi16(sum_q3, q1_16)), 3);
435      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
436
437      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
438      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
439      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
440      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
441
442      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
443      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
444      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
445                             _mm_add_epi16(sum_p7, p2_16)), 4);
446      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
447                             _mm_add_epi16(sum_q7, q2_16)), 4);
448      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
449
450      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
451      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
452
453      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
454                                           _mm_add_epi16(sum_p3, p2_16)), 3);
455      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
456                                           _mm_add_epi16(sum_q3, q2_16)), 3);
457      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
458
459      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
460      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
461      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
462      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
463      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
464                             _mm_add_epi16(sum_p7, p3_16)), 4);
465      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
466                             _mm_add_epi16(sum_q7, q3_16)), 4);
467      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
468
469      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
470      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
471      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
472      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
473      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
474                             _mm_add_epi16(sum_p7, p4_16)), 4);
475      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
476                             _mm_add_epi16(sum_q7, q4_16)), 4);
477      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
478
479      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
480      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
481      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
482      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
483      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
484                             _mm_add_epi16(sum_p7, p5_16)), 4);
485      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
486                             _mm_add_epi16(sum_q7, q5_16)), 4);
487      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
488
489      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
490      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
491      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
492      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
493      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
494                             _mm_add_epi16(sum_p7, p6_16)), 4);
495      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
496                             _mm_add_epi16(sum_q7, q6_16)), 4);
497      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
498    }
499    // wide flat
500    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
501
502    flat = _mm_shuffle_epi32(flat, 68);
503    flat2 = _mm_shuffle_epi32(flat2, 68);
504
505    q2p2 = _mm_andnot_si128(flat, q2p2);
506    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
507    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
508
509    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
510    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
511    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
512
513    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
514    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
515    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
516
517    q6p6 = _mm_andnot_si128(flat2, q6p6);
518    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
519    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
520    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
521    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
522
523    q5p5 = _mm_andnot_si128(flat2, q5p5);
524    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
525    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
526    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
527    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
528
529    q4p4 = _mm_andnot_si128(flat2, q4p4);
530    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
531    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
532    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
533    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
534
535    q3p3 = _mm_andnot_si128(flat2, q3p3);
536    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
537    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
538    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
539    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
540
541    q2p2 = _mm_andnot_si128(flat2, q2p2);
542    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
543    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
544    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
545    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
546
547    q1p1 = _mm_andnot_si128(flat2, q1p1);
548    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
549    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
550    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
551    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
552
553    q0p0 = _mm_andnot_si128(flat2, q0p0);
554    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
555    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
556    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
557    _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));
558  }
559}
560
561static INLINE __m128i filter_add2_sub2(const __m128i *const total,
562                                       const __m128i *const a1,
563                                       const __m128i *const a2,
564                                       const __m128i *const s1,
565                                       const __m128i *const s2) {
566  __m128i x = _mm_add_epi16(*a1, *total);
567  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
568  return x;
569}
570
571static INLINE __m128i filter8_mask(const __m128i *const flat,
572                                   const __m128i *const other_filt,
573                                   const __m128i *const f8_lo,
574                                   const __m128i *const f8_hi) {
575  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
576                                      _mm_srli_epi16(*f8_hi, 3));
577  const __m128i result = _mm_and_si128(*flat, f8);
578  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
579}
580
581static INLINE __m128i filter16_mask(const __m128i *const flat,
582                                    const __m128i *const other_filt,
583                                    const __m128i *const f_lo,
584                                    const __m128i *const f_hi) {
585  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
586                                     _mm_srli_epi16(*f_hi, 4));
587  const __m128i result = _mm_and_si128(*flat, f);
588  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
589}
590
591void vpx_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
592                                     const unsigned char *_blimit,
593                                     const unsigned char *_limit,
594                                     const unsigned char *_thresh) {
595  const __m128i zero = _mm_set1_epi16(0);
596  const __m128i one = _mm_set1_epi8(1);
597  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
598  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
599  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
600  __m128i mask, hev, flat, flat2;
601  __m128i p7, p6, p5;
602  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
603  __m128i q5, q6, q7;
604
605  __m128i op2, op1, op0, oq0, oq1, oq2;
606
607  __m128i max_abs_p1p0q1q0;
608
609  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
610  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
611  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
612  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
613  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
614  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
615  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
616  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
617  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
618  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
619  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
620  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
621  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
622  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
623  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
624  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
625
626  {
627    const __m128i abs_p1p0 = abs_diff(p1, p0);
628    const __m128i abs_q1q0 = abs_diff(q1, q0);
629    const __m128i fe = _mm_set1_epi8(0xfe);
630    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
631    __m128i abs_p0q0 = abs_diff(p0, q0);
632    __m128i abs_p1q1 = abs_diff(p1, q1);
633    __m128i work;
634    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
635
636    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
637    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
638    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
639    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
640    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
641    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
642    // mask |= (abs(p1 - p0) > limit) * -1;
643    // mask |= (abs(q1 - q0) > limit) * -1;
644    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
645    mask = _mm_max_epu8(work, mask);
646    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
647    mask = _mm_max_epu8(work, mask);
648    mask = _mm_subs_epu8(mask, limit);
649    mask = _mm_cmpeq_epi8(mask, zero);
650  }
651
652  {
653    __m128i work;
654    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
655    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
656    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
657    flat = _mm_max_epu8(work, flat);
658    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
659    flat = _mm_subs_epu8(flat, one);
660    flat = _mm_cmpeq_epi8(flat, zero);
661    flat = _mm_and_si128(flat, mask);
662    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
663    flat2 = _mm_max_epu8(work, flat2);
664    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
665    flat2 = _mm_max_epu8(work, flat2);
666    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
667    flat2 = _mm_max_epu8(work, flat2);
668    flat2 = _mm_subs_epu8(flat2, one);
669    flat2 = _mm_cmpeq_epi8(flat2, zero);
670    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
671  }
672
673  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
674  // filter4
675  {
676    const __m128i t4 = _mm_set1_epi8(4);
677    const __m128i t3 = _mm_set1_epi8(3);
678    const __m128i t80 = _mm_set1_epi8(0x80);
679    const __m128i te0 = _mm_set1_epi8(0xe0);
680    const __m128i t1f = _mm_set1_epi8(0x1f);
681    const __m128i t1 = _mm_set1_epi8(0x1);
682    const __m128i t7f = _mm_set1_epi8(0x7f);
683    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
684
685    __m128i filt;
686    __m128i work_a;
687    __m128i filter1, filter2;
688
689    op1 = _mm_xor_si128(p1, t80);
690    op0 = _mm_xor_si128(p0, t80);
691    oq0 = _mm_xor_si128(q0, t80);
692    oq1 = _mm_xor_si128(q1, t80);
693
694    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
695    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
696    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
697
698    work_a = _mm_subs_epi8(oq0, op0);
699    filt = _mm_adds_epi8(filt, work_a);
700    filt = _mm_adds_epi8(filt, work_a);
701    filt = _mm_adds_epi8(filt, work_a);
702    // (vpx_filter + 3 * (qs0 - ps0)) & mask
703    filt = _mm_and_si128(filt, mask);
704    filter1 = _mm_adds_epi8(filt, t4);
705    filter2 = _mm_adds_epi8(filt, t3);
706
707    // Filter1 >> 3
708    work_a = _mm_cmpgt_epi8(zero, filter1);
709    filter1 = _mm_srli_epi16(filter1, 3);
710    work_a = _mm_and_si128(work_a, te0);
711    filter1 = _mm_and_si128(filter1, t1f);
712    filter1 = _mm_or_si128(filter1, work_a);
713    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
714
715    // Filter2 >> 3
716    work_a = _mm_cmpgt_epi8(zero, filter2);
717    filter2 = _mm_srli_epi16(filter2, 3);
718    work_a = _mm_and_si128(work_a, te0);
719    filter2 = _mm_and_si128(filter2, t1f);
720    filter2 = _mm_or_si128(filter2, work_a);
721    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
722
723    // filt >> 1
724    filt = _mm_adds_epi8(filter1, t1);
725    work_a = _mm_cmpgt_epi8(zero, filt);
726    filt = _mm_srli_epi16(filt, 1);
727    work_a = _mm_and_si128(work_a, t80);
728    filt = _mm_and_si128(filt, t7f);
729    filt = _mm_or_si128(filt, work_a);
730    filt = _mm_andnot_si128(hev, filt);
731    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
732    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
733    // loopfilter done
734
735    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
736    // filter8
737    {
738      const __m128i four = _mm_set1_epi16(4);
739      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
740      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
741      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
742      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
743      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
744      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
745      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
746      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
747
748      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
749      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
750      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
751      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
752      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
753      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
754      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
755      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
756      __m128i f8_lo, f8_hi;
757
758      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
759                            _mm_add_epi16(p3_lo, p2_lo));
760      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
761                            _mm_add_epi16(p2_lo, p1_lo));
762      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
763
764      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
765                            _mm_add_epi16(p3_hi, p2_hi));
766      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
767                            _mm_add_epi16(p2_hi, p1_hi));
768      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
769
770      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
771
772      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
773      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
774      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
775
776      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
777      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
778      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
779
780      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
781      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
782      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
783
784      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
785      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
786      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
787
788      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
789      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
790      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
791    }
792
793    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
794    // wide flat calculations
795    {
796      const __m128i eight = _mm_set1_epi16(8);
797      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
798      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
799      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
800      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
801      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
802      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
803      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
804      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
805      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
806      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
807      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
808      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
809      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
810      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
811      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
812      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
813
814      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
815      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
816      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
817      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
818      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
819      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
820      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
821      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
822      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
823      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
824      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
825      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
826      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
827      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
828      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
829      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
830
831      __m128i f_lo;
832      __m128i f_hi;
833
834      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
835      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
836                           _mm_add_epi16(p4_lo, f_lo));
837      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
838                           _mm_add_epi16(p2_lo, p1_lo));
839      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
840      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
841
842      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
843      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
844                           _mm_add_epi16(p4_hi, f_hi));
845      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
846                           _mm_add_epi16(p2_hi, p1_hi));
847      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
848      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
849
850      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
851      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
852
853      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
854      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
855      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
856      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
857
858      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
859      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
860      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
861      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
862
863      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
864      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
865      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
866      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
867
868      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
869      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
870      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
871      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
872
873      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
874      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
875      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
876      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
877
878      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
879      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
880      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
881      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
882
883      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
884      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
885      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
886      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
887
888      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
889      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
890      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
891      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
892
893      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
894      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
895      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
896      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
897
898      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
899      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
900      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
901      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
902
903      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
904      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
905      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
906      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
907
908      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
909      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
910      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
911      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
912
913      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
914      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
915      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
916      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
917    }
918    // wide flat
919    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
920  }
921}
922
923void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
924                               const unsigned char *_blimit,
925                               const unsigned char *_limit,
926                               const unsigned char *_thresh) {
927  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
928  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
929  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
930  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
931  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
932  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
933  const __m128i zero = _mm_set1_epi16(0);
934  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
935  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
936  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
937  __m128i mask, hev, flat;
938  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
939  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
940
941  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
942                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
943  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
944                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
945  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
946                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
947  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
948                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
949  p1q1 = _mm_shuffle_epi32(q1p1, 78);
950  p0q0 = _mm_shuffle_epi32(q0p0, 78);
951
952  {
953    // filter_mask and hev_mask
954    const __m128i one = _mm_set1_epi8(1);
955    const __m128i fe = _mm_set1_epi8(0xfe);
956    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
957    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
958    abs_p1p0 = abs_diff(q1p1, q0p0);
959    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
960
961    abs_p0q0 = abs_diff(q0p0, p0q0);
962    abs_p1q1 = abs_diff(q1p1, p1q1);
963    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
964    hev = _mm_subs_epu8(flat, thresh);
965    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
966
967    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
968    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
969    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
970    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
971    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
972    mask = _mm_max_epu8(abs_p1p0, mask);
973    // mask |= (abs(p1 - p0) > limit) * -1;
974    // mask |= (abs(q1 - q0) > limit) * -1;
975
976    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
977                        abs_diff(q3p3, q2p2));
978    mask = _mm_max_epu8(work, mask);
979    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
980    mask = _mm_subs_epu8(mask, limit);
981    mask = _mm_cmpeq_epi8(mask, zero);
982
983    // flat_mask4
984
985    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
986                        abs_diff(q3p3, q0p0));
987    flat = _mm_max_epu8(abs_p1p0, flat);
988    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
989    flat = _mm_subs_epu8(flat, one);
990    flat = _mm_cmpeq_epi8(flat, zero);
991    flat = _mm_and_si128(flat, mask);
992  }
993
994  {
995    const __m128i four = _mm_set1_epi16(4);
996    unsigned char *src = s;
997    {
998      __m128i workp_a, workp_b, workp_shft;
999      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
1000      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
1001      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
1002      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
1003      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
1004      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
1005      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
1006      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
1007
1008      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1009      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1010      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1011      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1012      _mm_storel_epi64((__m128i *)&flat_op2[0],
1013                       _mm_packus_epi16(workp_shft, workp_shft));
1014
1015      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1016      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1017      _mm_storel_epi64((__m128i *)&flat_op1[0],
1018                       _mm_packus_epi16(workp_shft, workp_shft));
1019
1020      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1021      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1022      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1023      _mm_storel_epi64((__m128i *)&flat_op0[0],
1024                       _mm_packus_epi16(workp_shft, workp_shft));
1025
1026      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1027      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1028      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1029      _mm_storel_epi64((__m128i *)&flat_oq0[0],
1030                       _mm_packus_epi16(workp_shft, workp_shft));
1031
1032      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1033      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1034      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1035      _mm_storel_epi64((__m128i *)&flat_oq1[0],
1036                       _mm_packus_epi16(workp_shft, workp_shft));
1037
1038      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1039      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1040      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1041      _mm_storel_epi64((__m128i *)&flat_oq2[0],
1042                       _mm_packus_epi16(workp_shft, workp_shft));
1043    }
1044  }
1045  // lp filter
1046  {
1047    const __m128i t4 = _mm_set1_epi8(4);
1048    const __m128i t3 = _mm_set1_epi8(3);
1049    const __m128i t80 = _mm_set1_epi8(0x80);
1050    const __m128i t1 = _mm_set1_epi8(0x1);
1051    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
1052                                      t80);
1053    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
1054                                      t80);
1055    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
1056                                      t80);
1057    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
1058                                      t80);
1059    __m128i filt;
1060    __m128i work_a;
1061    __m128i filter1, filter2;
1062
1063    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1064    work_a = _mm_subs_epi8(qs0, ps0);
1065    filt = _mm_adds_epi8(filt, work_a);
1066    filt = _mm_adds_epi8(filt, work_a);
1067    filt = _mm_adds_epi8(filt, work_a);
1068    // (vpx_filter + 3 * (qs0 - ps0)) & mask
1069    filt = _mm_and_si128(filt, mask);
1070
1071    filter1 = _mm_adds_epi8(filt, t4);
1072    filter2 = _mm_adds_epi8(filt, t3);
1073
1074    // Filter1 >> 3
1075    filter1 = _mm_unpacklo_epi8(zero, filter1);
1076    filter1 = _mm_srai_epi16(filter1, 11);
1077    filter1 = _mm_packs_epi16(filter1, filter1);
1078
1079    // Filter2 >> 3
1080    filter2 = _mm_unpacklo_epi8(zero, filter2);
1081    filter2 = _mm_srai_epi16(filter2, 11);
1082    filter2 = _mm_packs_epi16(filter2, zero);
1083
1084    // filt >> 1
1085    filt = _mm_adds_epi8(filter1, t1);
1086    filt = _mm_unpacklo_epi8(zero, filt);
1087    filt = _mm_srai_epi16(filt, 9);
1088    filt = _mm_packs_epi16(filt, zero);
1089
1090    filt = _mm_andnot_si128(hev, filt);
1091
1092    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1093    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1094    work_a = _mm_andnot_si128(flat, work_a);
1095    q0 = _mm_and_si128(flat, q0);
1096    q0 = _mm_or_si128(work_a, q0);
1097
1098    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1099    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1100    work_a = _mm_andnot_si128(flat, work_a);
1101    q1 = _mm_and_si128(flat, q1);
1102    q1 = _mm_or_si128(work_a, q1);
1103
1104    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1105    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1106    work_a = _mm_andnot_si128(flat, work_a);
1107    q2 = _mm_and_si128(flat, q2);
1108    q2 = _mm_or_si128(work_a, q2);
1109
1110    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1111    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1112    work_a = _mm_andnot_si128(flat, work_a);
1113    p0 = _mm_and_si128(flat, p0);
1114    p0 = _mm_or_si128(work_a, p0);
1115
1116    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1117    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1118    work_a = _mm_andnot_si128(flat, work_a);
1119    p1 = _mm_and_si128(flat, p1);
1120    p1 = _mm_or_si128(work_a, p1);
1121
1122    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1123    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1124    work_a = _mm_andnot_si128(flat, work_a);
1125    p2 = _mm_and_si128(flat, p2);
1126    p2 = _mm_or_si128(work_a, p2);
1127
1128    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1129    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
1130    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
1131    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
1132    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
1133    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1134  }
1135}
1136
1137void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
1138                                    const uint8_t *_blimit0,
1139                                    const uint8_t *_limit0,
1140                                    const uint8_t *_thresh0,
1141                                    const uint8_t *_blimit1,
1142                                    const uint8_t *_limit1,
1143                                    const uint8_t *_thresh1) {
1144  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
1145  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
1146  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
1147  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
1148  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
1149  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
1150  const __m128i zero = _mm_set1_epi16(0);
1151  const __m128i blimit =
1152      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1153                         _mm_load_si128((const __m128i *)_blimit1));
1154  const __m128i limit =
1155      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1156                         _mm_load_si128((const __m128i *)_limit1));
1157  const __m128i thresh =
1158      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1159                         _mm_load_si128((const __m128i *)_thresh1));
1160
1161  __m128i mask, hev, flat;
1162  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1163
1164  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1165  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1166  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1167  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1168  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1169  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1170  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1171  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1172  {
1173    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1174                                          _mm_subs_epu8(p0, p1));
1175    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1176                                          _mm_subs_epu8(q0, q1));
1177    const __m128i one = _mm_set1_epi8(1);
1178    const __m128i fe = _mm_set1_epi8(0xfe);
1179    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1180    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1181                                    _mm_subs_epu8(q0, p0));
1182    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1183                                    _mm_subs_epu8(q1, p1));
1184    __m128i work;
1185
1186    // filter_mask and hev_mask
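        // Three per-pixel byte masks (0x00 or 0xff) are built: hev flags
        // pixels whose |p1 - p0| or |q1 - q0| exceeds thresh, mask flags
        // pixels that pass the limit/blimit tests, and flat (refined by
        // flat_mask4 below) flags pixels smooth enough to take the filter8
        // output.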
1187    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1188    hev = _mm_subs_epu8(flat, thresh);
1189    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1190
1191    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1192    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1193    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1194    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1195    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1196    mask = _mm_max_epu8(flat, mask);
1197    // mask |= (abs(p1 - p0) > limit) * -1;
1198    // mask |= (abs(q1 - q0) > limit) * -1;
1199    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1200                                     _mm_subs_epu8(p1, p2)),
1201                         _mm_or_si128(_mm_subs_epu8(p3, p2),
1202                                      _mm_subs_epu8(p2, p3)));
1203    mask = _mm_max_epu8(work, mask);
1204    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1205                                     _mm_subs_epu8(q1, q2)),
1206                         _mm_or_si128(_mm_subs_epu8(q3, q2),
1207                                      _mm_subs_epu8(q2, q3)));
1208    mask = _mm_max_epu8(work, mask);
1209    mask = _mm_subs_epu8(mask, limit);
1210    mask = _mm_cmpeq_epi8(mask, zero);
1211
1212    // flat_mask4
1213    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
1214                                     _mm_subs_epu8(p0, p2)),
1215                         _mm_or_si128(_mm_subs_epu8(q2, q0),
1216                                      _mm_subs_epu8(q0, q2)));
1217    flat = _mm_max_epu8(work, flat);
1218    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
1219                                     _mm_subs_epu8(p0, p3)),
1220                         _mm_or_si128(_mm_subs_epu8(q3, q0),
1221                                      _mm_subs_epu8(q0, q3)));
1222    flat = _mm_max_epu8(work, flat);
1223    flat = _mm_subs_epu8(flat, one);
1224    flat = _mm_cmpeq_epi8(flat, zero);
1225    flat = _mm_and_si128(flat, mask);
1226  }
1227  {
1228    const __m128i four = _mm_set1_epi16(4);
1229    unsigned char *src = s;
1230    int i = 0;
1231
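        // Each 8-pixel half gets the six filter8 outputs as rounded 7-tap
        // sums, e.g. op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3.
        // workp_a/workp_b carry running sums; later outputs update them by
        // dropping the sample that leaves the window and adding the one that
        // enters it.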
1232    do {
1233      __m128i workp_a, workp_b, workp_shft;
1234      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
1235      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
1236      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
1237      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
1238      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
1239      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
1240      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
1241      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
1242
1243      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1244      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1245      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1246      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1247      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1248                       _mm_packus_epi16(workp_shft, workp_shft));
1249
1250      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1251      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1252      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1253                       _mm_packus_epi16(workp_shft, workp_shft));
1254
1255      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1256      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1257      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1258      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1259                       _mm_packus_epi16(workp_shft, workp_shft));
1260
1261      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1262      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1263      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1264      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1265                       _mm_packus_epi16(workp_shft, workp_shft));
1266
1267      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1268      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1269      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1270      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1271                       _mm_packus_epi16(workp_shft, workp_shft));
1272
1273      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1274      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1275      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1276      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1277                       _mm_packus_epi16(workp_shft, workp_shft));
1278
1279      src += 8;
1280    } while (++i < 2);
1281  }
1282  // lp filter
1283  {
1284    const __m128i t4 = _mm_set1_epi8(4);
1285    const __m128i t3 = _mm_set1_epi8(3);
1286    const __m128i t80 = _mm_set1_epi8(0x80);
1287    const __m128i te0 = _mm_set1_epi8(0xe0);
1288    const __m128i t1f = _mm_set1_epi8(0x1f);
1289    const __m128i t1 = _mm_set1_epi8(0x1);
1290    const __m128i t7f = _mm_set1_epi8(0x7f);
1291
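        // XOR with 0x80 maps the pixels nearest the edge into the signed
        // domain (p - 128), so the saturating signed-byte arithmetic below
        // matches the scalar filter4 math.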
1292    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1293                                      t80);
1294    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1295                                      t80);
1296    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1297                                      t80);
1298    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1299                                      t80);
1300    __m128i filt;
1301    __m128i work_a;
1302    __m128i filter1, filter2;
1303
1304    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1305    work_a = _mm_subs_epi8(qs0, ps0);
1306    filt = _mm_adds_epi8(filt, work_a);
1307    filt = _mm_adds_epi8(filt, work_a);
1308    filt = _mm_adds_epi8(filt, work_a);
1309    // (vpx_filter + 3 * (qs0 - ps0)) & mask
1310    filt = _mm_and_si128(filt, mask);
1311
1312    filter1 = _mm_adds_epi8(filt, t4);
1313    filter2 = _mm_adds_epi8(filt, t3);
1314
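        // Signed >> 3 without widening: shift the 16-bit lanes logically,
        // keep only the low five bits of every byte (t1f), then OR the three
        // sign bits (te0) back into bytes that were negative.  The same idea
        // with t7f/t80 produces the signed >> 1 further below.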
1315    // Filter1 >> 3
1316    work_a = _mm_cmpgt_epi8(zero, filter1);
1317    filter1 = _mm_srli_epi16(filter1, 3);
1318    work_a = _mm_and_si128(work_a, te0);
1319    filter1 = _mm_and_si128(filter1, t1f);
1320    filter1 = _mm_or_si128(filter1, work_a);
1321
1322    // Filter2 >> 3
1323    work_a = _mm_cmpgt_epi8(zero, filter2);
1324    filter2 = _mm_srli_epi16(filter2, 3);
1325    work_a = _mm_and_si128(work_a, te0);
1326    filter2 = _mm_and_si128(filter2, t1f);
1327    filter2 = _mm_or_si128(filter2, work_a);
1328
1329    // filt >> 1
1330    filt = _mm_adds_epi8(filter1, t1);
1331    work_a = _mm_cmpgt_epi8(zero, filt);
1332    filt = _mm_srli_epi16(filt, 1);
1333    work_a = _mm_and_si128(work_a, t80);
1334    filt = _mm_and_si128(filt, t7f);
1335    filt = _mm_or_si128(filt, work_a);
1336
1337    filt = _mm_andnot_si128(hev, filt);
1338
1339    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1340    q0 = _mm_load_si128((__m128i *)flat_oq0);
1341    work_a = _mm_andnot_si128(flat, work_a);
1342    q0 = _mm_and_si128(flat, q0);
1343    q0 = _mm_or_si128(work_a, q0);
1344
1345    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1346    q1 = _mm_load_si128((__m128i *)flat_oq1);
1347    work_a = _mm_andnot_si128(flat, work_a);
1348    q1 = _mm_and_si128(flat, q1);
1349    q1 = _mm_or_si128(work_a, q1);
1350
1351    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1352    q2 = _mm_load_si128((__m128i *)flat_oq2);
1353    work_a = _mm_andnot_si128(flat, work_a);
1354    q2 = _mm_and_si128(flat, q2);
1355    q2 = _mm_or_si128(work_a, q2);
1356
1357    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1358    p0 = _mm_load_si128((__m128i *)flat_op0);
1359    work_a = _mm_andnot_si128(flat, work_a);
1360    p0 = _mm_and_si128(flat, p0);
1361    p0 = _mm_or_si128(work_a, p0);
1362
1363    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1364    p1 = _mm_load_si128((__m128i *)flat_op1);
1365    work_a = _mm_andnot_si128(flat, work_a);
1366    p1 = _mm_and_si128(flat, p1);
1367    p1 = _mm_or_si128(work_a, p1);
1368
1369    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1370    p2 = _mm_load_si128((__m128i *)flat_op2);
1371    work_a = _mm_andnot_si128(flat, work_a);
1372    p2 = _mm_and_si128(flat, p2);
1373    p2 = _mm_or_si128(work_a, p2);
1374
1375    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
1376    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1377    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1378    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1379    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1380    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
1381  }
1382}
1383
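    // The filter4 path below corresponds to this scalar per-pixel update (an
    // illustrative sketch of what the SIMD computes; clamp8() stands for
    // saturation to [-128, 127] and is not a function in this file):
    //
    //   // ps1, ps0, qs0, qs1: the four pixels straddling the edge, less 128.
    //   filt = clamp8(ps1 - qs1) & hev;
    //   filt = clamp8(filt + 3 * (qs0 - ps0)) & mask;  // three saturating adds
    //   Filter1 = clamp8(filt + 4) >> 3;               // adjustment for q0
    //   Filter2 = clamp8(filt + 3) >> 3;               // adjustment for p0
    //   oq0 = clamp8(qs0 - Filter1) + 128;
    //   op0 = clamp8(ps0 + Filter2) + 128;
    //   filt = (clamp8(Filter1 + 1) >> 1) & ~hev;      // half strength, non-hev only
    //   oq1 = clamp8(qs1 - filt) + 128;
    //   op1 = clamp8(ps1 + filt) + 128;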
1384void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1385                                    const unsigned char *_blimit0,
1386                                    const unsigned char *_limit0,
1387                                    const unsigned char *_thresh0,
1388                                    const unsigned char *_blimit1,
1389                                    const unsigned char *_limit1,
1390                                    const unsigned char *_thresh1) {
1391  const __m128i blimit =
1392      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1393                         _mm_load_si128((const __m128i *)_blimit1));
1394  const __m128i limit =
1395      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1396                         _mm_load_si128((const __m128i *)_limit1));
1397  const __m128i thresh =
1398      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1399                         _mm_load_si128((const __m128i *)_thresh1));
1400  const __m128i zero = _mm_set1_epi16(0);
1401  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1402  __m128i mask, hev, flat;
1403
1404  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1405  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1406  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1407  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1408  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1409  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1410  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1411  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1412
1413  // filter_mask and hev_mask
1414  {
1415    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1416                                          _mm_subs_epu8(p0, p1));
1417    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1418                                          _mm_subs_epu8(q0, q1));
1419    const __m128i fe = _mm_set1_epi8(0xfe);
1420    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1421    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1422                                    _mm_subs_epu8(q0, p0));
1423    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1424                                    _mm_subs_epu8(q1, p1));
1425    __m128i work;
1426
1427    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1428    hev = _mm_subs_epu8(flat, thresh);
1429    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1430
1431    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
1432    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1433    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1434    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1435    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
1436    mask = _mm_max_epu8(flat, mask);
1437    // mask |= (abs(p1 - p0) > limit) * -1;
1438    // mask |= (abs(q1 - q0) > limit) * -1;
1439    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1440                                     _mm_subs_epu8(p1, p2)),
1441                         _mm_or_si128(_mm_subs_epu8(p3, p2),
1442                                      _mm_subs_epu8(p2, p3)));
1443    mask = _mm_max_epu8(work, mask);
1444    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1445                                     _mm_subs_epu8(q1, q2)),
1446                         _mm_or_si128(_mm_subs_epu8(q3, q2),
1447                                      _mm_subs_epu8(q2, q3)));
1448    mask = _mm_max_epu8(work, mask);
1449    mask = _mm_subs_epu8(mask, limit);
1450    mask = _mm_cmpeq_epi8(mask, zero);
1451  }
1452
1453  // filter4
1454  {
1455    const __m128i t4 = _mm_set1_epi8(4);
1456    const __m128i t3 = _mm_set1_epi8(3);
1457    const __m128i t80 = _mm_set1_epi8(0x80);
1458    const __m128i te0 = _mm_set1_epi8(0xe0);
1459    const __m128i t1f = _mm_set1_epi8(0x1f);
1460    const __m128i t1 = _mm_set1_epi8(0x1);
1461    const __m128i t7f = _mm_set1_epi8(0x7f);
1462
1463    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1464                                      t80);
1465    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1466                                      t80);
1467    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1468                                      t80);
1469    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1470                                      t80);
1471    __m128i filt;
1472    __m128i work_a;
1473    __m128i filter1, filter2;
1474
1475    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1476    work_a = _mm_subs_epi8(qs0, ps0);
1477    filt = _mm_adds_epi8(filt, work_a);
1478    filt = _mm_adds_epi8(filt, work_a);
1479    filt = _mm_adds_epi8(filt, work_a);
1480    // (vpx_filter + 3 * (qs0 - ps0)) & mask
1481    filt = _mm_and_si128(filt, mask);
1482
1483    filter1 = _mm_adds_epi8(filt, t4);
1484    filter2 = _mm_adds_epi8(filt, t3);
1485
1486    // Filter1 >> 3
1487    work_a = _mm_cmpgt_epi8(zero, filter1);
1488    filter1 = _mm_srli_epi16(filter1, 3);
1489    work_a = _mm_and_si128(work_a, te0);
1490    filter1 = _mm_and_si128(filter1, t1f);
1491    filter1 = _mm_or_si128(filter1, work_a);
1492
1493    // Filter2 >> 3
1494    work_a = _mm_cmpgt_epi8(zero, filter2);
1495    filter2 = _mm_srli_epi16(filter2, 3);
1496    work_a = _mm_and_si128(work_a, te0);
1497    filter2 = _mm_and_si128(filter2, t1f);
1498    filter2 = _mm_or_si128(filter2, work_a);
1499
1500    // filt >> 1
1501    filt = _mm_adds_epi8(filter1, t1);
1502    work_a = _mm_cmpgt_epi8(zero, filt);
1503    filt = _mm_srli_epi16(filt, 1);
1504    work_a = _mm_and_si128(work_a, t80);
1505    filt = _mm_and_si128(filt, t7f);
1506    filt = _mm_or_si128(filt, work_a);
1507
1508    filt = _mm_andnot_si128(hev, filt);
1509
1510    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1511    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1512    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1513    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1514
1515    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1516    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1517    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1518    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1519  }
1520}
1521
1522static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
1523                                 int in_p, unsigned char *out, int out_p) {
1524  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1525  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
1526
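      // Transposes a 16x8 region: in0 and in1 each address an 8x8 tile (rows
      // in_p apart), and the two transposed tiles land side by side as eight
      // 16-byte rows at out.  Bytes, then 16-, 32- and 64-bit groups are
      // interleaved; the trailing numeric comments appear to note the
      // intended issue order of the interleaved loads and unpacks.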
1527  // 2-way interleave w/hoisting of unpacks
1528  x0 = _mm_loadl_epi64((__m128i *)in0);  // 1
1529  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
1530  x0 = _mm_unpacklo_epi8(x0, x1);  // 1
1531
1532  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
1533  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
1534  x1 = _mm_unpacklo_epi8(x2, x3);  // 2
1535
1536  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
1537  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
1538  x2 = _mm_unpacklo_epi8(x4, x5);  // 3
1539
1540  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
1541  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
1542  x3 = _mm_unpacklo_epi8(x6, x7);  // 4
1543  x4 = _mm_unpacklo_epi16(x0, x1);  // 9
1544
1545  x8 = _mm_loadl_epi64((__m128i *)in1);  // 2
1546  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
1547  x8 = _mm_unpacklo_epi8(x8, x9);  // 5
1548  x5 = _mm_unpacklo_epi16(x2, x3);  // 10
1549
1550  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
1551  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
1552  x9 = _mm_unpacklo_epi8(x10, x11);  // 6
1553
1554  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
1555  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
1556  x10 = _mm_unpacklo_epi8(x12, x13);  // 7
1557  x12 = _mm_unpacklo_epi16(x8, x9);  // 11
1558
1559  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
1560  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
1561  x11 = _mm_unpacklo_epi8(x14, x15);  // 8
1562  x13 = _mm_unpacklo_epi16(x10, x11);  // 12
1563
1564  x6 = _mm_unpacklo_epi32(x4, x5);  // 13
1565  x7 = _mm_unpackhi_epi32(x4, x5);  // 14
1566  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
1567  x15 = _mm_unpackhi_epi32(x12, x13);  // 16
1568
1569  // Store first 4-line result
1570  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
1571  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
1572  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
1573  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
1574
1575  x4 = _mm_unpackhi_epi16(x0, x1);
1576  x5 = _mm_unpackhi_epi16(x2, x3);
1577  x12 = _mm_unpackhi_epi16(x8, x9);
1578  x13 = _mm_unpackhi_epi16(x10, x11);
1579
1580  x6 = _mm_unpacklo_epi32(x4, x5);
1581  x7 = _mm_unpackhi_epi32(x4, x5);
1582  x14 = _mm_unpacklo_epi32(x12, x13);
1583  x15 = _mm_unpackhi_epi32(x12, x13);
1584
1585  // Store second 4-line result
1586  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
1587  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
1588  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
1589  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
1590}
1591
1592static INLINE void transpose(unsigned char *src[], int in_p,
1593                             unsigned char *dst[], int out_p,
1594                             int num_8x8_to_transpose) {
1595  int idx8x8 = 0;
1596  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1597  do {
1598    unsigned char *in = src[idx8x8];
1599    unsigned char *out = dst[idx8x8];
1600
1601    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
1602    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
1603    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1604    x0 = _mm_unpacklo_epi8(x0, x1);
1605
1606    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
1607    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
1608    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
1609    x1 = _mm_unpacklo_epi8(x2, x3);
1610
1611    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
1612    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
1613    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
1614    x2 = _mm_unpacklo_epi8(x4, x5);
1615
1616    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
1617    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
1618    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
1619    x3 = _mm_unpacklo_epi8(x6, x7);
1620
1621    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1622    x4 = _mm_unpacklo_epi16(x0, x1);
1623    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1624    x5 = _mm_unpacklo_epi16(x2, x3);
1625    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1626    x6 = _mm_unpacklo_epi32(x4, x5);
1627    _mm_storel_pd((double *)(out + 0*out_p),
1628                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
1629    _mm_storeh_pd((double *)(out + 1*out_p),
1630                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
1631    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1632    x7 = _mm_unpackhi_epi32(x4, x5);
1633    _mm_storel_pd((double *)(out + 2*out_p),
1634                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
1635    _mm_storeh_pd((double *)(out + 3*out_p),
1636                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
1637
1638    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
1639    x4 = _mm_unpackhi_epi16(x0, x1);
1640    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
1641    x5 = _mm_unpackhi_epi16(x2, x3);
1642    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
1643    x6 = _mm_unpacklo_epi32(x4, x5);
1644    _mm_storel_pd((double *)(out + 4*out_p),
1645                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
1646    _mm_storeh_pd((double *)(out + 5*out_p),
1647                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
1648    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
1649    x7 = _mm_unpackhi_epi32(x4, x5);
1650
1651    _mm_storel_pd((double *)(out + 6*out_p),
1652                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
1653    _mm_storeh_pd((double *)(out + 7*out_p),
1654                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
1655  } while (++idx8x8 < num_8x8_to_transpose);
1656}
1657
1658void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1659                                  const uint8_t *limit0,
1660                                  const uint8_t *thresh0,
1661                                  const uint8_t *blimit1,
1662                                  const uint8_t *limit1,
1663                                  const uint8_t *thresh1) {
1664  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1665  unsigned char *src[2];
1666  unsigned char *dst[2];
1667
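      // Vertical edges reuse the horizontal kernels: the pixels around the
      // edge are transposed into t_dst, filtered as a horizontal edge, and
      // transposed back into place.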
1668  // Transpose 8x16
1669  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1670
1671  // Loop filtering
1672  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1673                                 blimit1, limit1, thresh1);
1674  src[0] = t_dst;
1675  src[1] = t_dst + 8;
1676  dst[0] = s - 4;
1677  dst[1] = s - 4 + p * 8;
1678
1679  // Transpose back
1680  transpose(src, 16, dst, p, 2);
1681}
1682
1683void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
1684                             const unsigned char *blimit,
1685                             const unsigned char *limit,
1686                             const unsigned char *thresh) {
1687  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
1688  unsigned char *src[1];
1689  unsigned char *dst[1];
1690
1691  // Transpose 8x8
1692  src[0] = s - 4;
1693  dst[0] = t_dst;
1694
1695  transpose(src, p, dst, 8, 1);
1696
1697  // Loop filtering
1698  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
1699
1700  src[0] = t_dst;
1701  dst[0] = s - 4;
1702
1703  // Transpose back
1704  transpose(src, 8, dst, p, 1);
1705}
1706
1707void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1708                                  const uint8_t *limit0,
1709                                  const uint8_t *thresh0,
1710                                  const uint8_t *blimit1,
1711                                  const uint8_t *limit1,
1712                                  const uint8_t *thresh1) {
1713  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1714  unsigned char *src[2];
1715  unsigned char *dst[2];
1716
1717  // Transpose 8x16
1718  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1719
1720  // Loop filtering
1721  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1722                                 blimit1, limit1, thresh1);
1723  src[0] = t_dst;
1724  src[1] = t_dst + 8;
1725
1726  dst[0] = s - 4;
1727  dst[1] = s - 4 + p * 8;
1728
1729  // Transpose back
1730  transpose(src, 16, dst, p, 2);
1731}
1732
1733void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
1734                              const unsigned char *blimit,
1735                              const unsigned char *limit,
1736                              const unsigned char *thresh) {
1737  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
1738  unsigned char *src[2];
1739  unsigned char *dst[2];
1740
1741  src[0] = s - 8;
1742  src[1] = s;
1743  dst[0] = t_dst;
1744  dst[1] = t_dst + 8 * 8;
1745
1746  // Transpose 16x8
1747  transpose(src, p, dst, 8, 2);
1748
1749  // Loop filtering
1750  vpx_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
1751
1752  src[0] = t_dst;
1753  src[1] = t_dst + 8 * 8;
1754  dst[0] = s - 8;
1755  dst[1] = s;
1756
1757  // Transpose back
1758  transpose(src, 8, dst, p, 2);
1759}
1760
1761void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1762                                   const uint8_t *blimit, const uint8_t *limit,
1763                                   const uint8_t *thresh) {
1764  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
1765
1766  // Transpose 16x16
1767  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1768  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1769
1770  // Loop filtering
1771  vpx_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
1772
1773  // Transpose back
1774  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1775  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1776}
1777