loopfilter_avx2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

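// Applies the wide (p7..q7) horizontal loop filter to an 8-pixel-wide
// segment of the edge.  s points at the q0 row and p is the row stride.
// Each qNpN register packs the 8 bytes of row pN into its low half and the
// 8 bytes of row qN into its high half, so the p side and the q side are
// filtered together with a single set of 8-lane byte operations.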
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
    __m128i abs_p1p0;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
    q4p4 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
    q3p3 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
    q2p2 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
    q1p1 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
    p1q1 = _mm_shuffle_epi32(q1p1, 78);
    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
    q0p0 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
    p0q0 = _mm_shuffle_epi32(q0p0, 78);

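    // Build the filter mask and the high-edge-variance (hev) mask.  hev is
    // 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.  mask ends up 0xff on
    // lanes where |p0 - q0| * 2 + |p1 - q1| / 2 <= blimit and every
    // neighbouring-row difference (|p1 - p0|, |p2 - p1|, |p3 - p2| and the
    // q-side equivalents) is <= limit; only those lanes are filtered.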
    {
        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                _mm_subs_epu8(q0p0, q1p1));
        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
        fe = _mm_set1_epi8(0xfe);
        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                _mm_subs_epu8(p0q0, q0p0));
        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                _mm_subs_epu8(p1q1, q1p1));
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(abs_p1p0, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;

        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                        _mm_subs_epu8(q1p1, q2p2)),
                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                        _mm_subs_epu8(q2p2, q3p3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    // lp filter
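    // Standard 4-tap filter, done in the packed q/p layout.  Pixels are
    // biased to signed range by XORing with 0x80.  filter1 = clamp(filt + 4)
    // and filter2 = clamp(filt + 3) are shifted right arithmetically by 3
    // using the unpack-with-zero / _mm_srai_epi16(x, 11) trick, then applied
    // with one packed add: the low half adjusts p0 (+filter2) while the
    // negated high half adjusts q0 (-filter1).  (filter1 + 1) >> 1 is applied
    // the same way to p1/q1 on lanes where hev is not set.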
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i t1 = _mm_set1_epi16(0x1);
        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
        __m128i qs0 = _mm_xor_si128(p0q0, t80);
        __m128i qs1 = _mm_xor_si128(p1q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, qs0ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

        filter1 = _mm_unpacklo_epi8(zero, filter1);
        filter1 = _mm_srai_epi16(filter1, 0xB);
        filter2 = _mm_unpacklo_epi8(zero, filter2);
        filter2 = _mm_srai_epi16(filter2, 0xB);

        /* Filter1 >> 3 */
        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi16(filter1, t1);
        filt = _mm_srai_epi16(filt, 1);
        filt = _mm_andnot_si128(
                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
        // loopfilter done

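        // Build the "flat" and "flat2" masks.  flat is 0xff where p1..p3 and
        // q1..q3 all differ from p0/q0 by at most 1 (and mask is set): those
        // lanes take the 7-tap filter below.  flat2 additionally requires
        // p4..p7 and q4..q7 to be within 1 of p0/q0: those lanes take the
        // 15-tap wide filter.  The remaining rows (q5p5..q7p7) are loaded
        // here, interleaved with the comparisons.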
        {
            __m128i work;
            flat = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                            _mm_subs_epu8(q0p0, q2p2)),
                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                            _mm_subs_epu8(q0p0, q3p3)));
            flat = _mm_max_epu8(abs_p1p0, flat);
            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
            q5p5 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
                            (__m64 *) (s + 5 * p)));

            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
            q6p6 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
                            (__m64 *) (s + 6 * p)));

            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                            _mm_subs_epu8(q0p0, q4p4)),
                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                            _mm_subs_epu8(q0p0, q5p5)));

            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
            q7p7 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
                            (__m64 *) (s + 7 * p)));

            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                            _mm_subs_epu8(q0p0, q6p6)),
                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                            _mm_subs_epu8(q0p0, q7p7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
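        // The rows are widened to 16 bits and the 7-tap and 15-tap outputs
        // are built from running sums: pixelFilter_p/q carry the common part
        // of the 15-tap sums plus the rounding constant 8 (results >> 4),
        // pixetFilter_p2p1p0/q2q1q0 the common part of the 7-tap sums plus 4
        // (results >> 3).  For each successive output row one sample leaves
        // the sum and a doubled end sample (p7/q7 or p3/q3) is added, so the
        // full sums are never recomputed.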
        {
            const __m128i eight = _mm_set1_epi16(8);
            const __m128i four = _mm_set1_epi16(4);
            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
            __m128i pixelFilter_p, pixelFilter_q;
            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
            q7_16 = _mm_unpackhi_epi8(q7p7, zero);

            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                    _mm_add_epi16(p4_16, p3_16));
            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                    _mm_add_epi16(q4_16, q3_16));

            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
                    _mm_add_epi16(p2_16, p1_16));
            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
                    _mm_add_epi16(q2_16, q1_16));
            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
            pixelFilter_p = _mm_add_epi16(eight,
                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
            pixetFilter_p2p1p0 = _mm_add_epi16(four,
                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
                    4);
            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(p3_16, p0_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(q3_16, q0_16)), 3);

            flat_q0p0 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(p7_16, p7_16);
            sum_q7 = _mm_add_epi16(q7_16, q7_16);
            sum_p3 = _mm_add_epi16(p3_16, p3_16);
            sum_q3 = _mm_add_epi16(q3_16, q3_16);

            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
                    4);
            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p1_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q1_16)), 3);
            flat_q1p1 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
            sum_q3 = _mm_add_epi16(sum_q3, q3_16);

            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
                    4);
            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p2_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q2_16)), 3);
            flat_q2p2 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
                    4);
            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
                    4);
            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
                    4);
            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
                    4);
            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
        }
        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

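        // 68 == 0b01000100 copies the low 64 bits of each mask into the high
        // half so flat/flat2, computed for 8 lanes, cover both the p and the
        // q half of the packed registers.  Each output row is then a per-byte
        // blend of the 4-tap, 7-tap and 15-tap results, written back as two
        // 8-byte halves (p row from the low half, q row from the high half).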
        flat = _mm_shuffle_epi32(flat, 68);
        flat2 = _mm_shuffle_epi32(flat2, 68);

        q2p2 = _mm_andnot_si128(flat, q2p2);
        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat_q2p2);

        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

        q6p6 = _mm_andnot_si128(flat2, q6p6);
        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));

        q5p5 = _mm_andnot_si128(flat2, q5p5);
        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));

        q4p4 = _mm_andnot_si128(flat2, q4p4);
        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));

        q3p3 = _mm_andnot_si128(flat2, q3p3);
        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));

        q2p2 = _mm_andnot_si128(flat2, q2p2);
        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));

        q1p1 = _mm_andnot_si128(flat2, q1p1);
        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));

        q0p0 = _mm_andnot_si128(flat2, q0p0);
        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
    }
}

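// Shuffle control for _mm256_shuffle_epi8: each source byte index is paired
// with 128 (bit 7 set zeroes the output byte), so applying it to a ymm whose
// two 128-bit lanes hold the same 16 pixels zero-extends those pixels to
// sixteen 16-bit lanes (bytes 0-7 in the low lane, bytes 8-15 in the high).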
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

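// Same wide horizontal filter applied to a full 16-pixel-wide segment.  Each
// row is loaded once with _mm256_broadcast_pd, which duplicates its 16 bytes
// into both 128-bit lanes: the low lane feeds the byte-wide mask and 4-tap
// filter code below, and the duplicated lanes let filt_loopfilter_avx2 widen
// the row to 16-bit values for the flat/wide-flat arithmetic.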
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i p7, p6, p5;
    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
    __m128i q5, q6, q7;
    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
            p256_0, q256_0;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 5 * p)));
    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 4 * p)));
    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 3 * p)));
    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 2 * p)));
    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 1 * p)));
    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s - 0 * p)));
    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 1 * p)));
    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 2 * p)));
    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 3 * p)));
    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                (__m128d const *)(s + 4 * p)));

    p4 = _mm256_castsi256_si128(p256_4);
    p3 = _mm256_castsi256_si128(p256_3);
    p2 = _mm256_castsi256_si128(p256_2);
    p1 = _mm256_castsi256_si128(p256_1);
    p0 = _mm256_castsi256_si128(p256_0);
    q0 = _mm256_castsi256_si128(q256_0);
    q1 = _mm256_castsi256_si128(q256_1);
    q2 = _mm256_castsi256_si128(q256_2);
    q3 = _mm256_castsi256_si128(q256_3);
    q4 = _mm256_castsi256_si128(q256_4);

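    // Filter-mask and hev computation, as in the 8-pixel version but done
    // directly on unpacked p/q rows covering all 16 pixels.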
    {
        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                _mm_subs_epu8(p0, p1));
        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                _mm_subs_epu8(q0, q1));
        const __m128i fe = _mm_set1_epi8(0xfe);
        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                _mm_subs_epu8(q0, p0));
        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                _mm_subs_epu8(q1, p1));
        __m128i work;
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(flat, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
        mask = _mm_max_epu8(work, mask);
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    // lp filter
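    // 4-tap filter on p1..q1.  The arithmetic >> 3 and >> 1 on signed bytes
    // are emulated with a logical 16-bit shift plus a sign fix-up
    // (cmpgt / and / or with 0xe0, 0x1f, 0x80, 0x7f).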
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i te0 = _mm_set1_epi8(0xe0);
        const __m128i t1f = _mm_set1_epi8(0x1f);
        const __m128i t1 = _mm_set1_epi8(0x1);
        const __m128i t7f = _mm_set1_epi8(0x7f);

        __m128i ps1 = _mm_xor_si128(p1, t80);
        __m128i ps0 = _mm_xor_si128(p0, t80);
        __m128i qs0 = _mm_xor_si128(q0, t80);
        __m128i qs1 = _mm_xor_si128(q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
                flat_q2;

        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

        /* Filter1 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter1);
        filter1 = _mm_srli_epi16(filter1, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter1 = _mm_and_si128(filter1, t1f);
        filter1 = _mm_or_si128(filter1, work_a);
        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

        /* Filter2 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter2);
        filter2 = _mm_srli_epi16(filter2, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter2 = _mm_and_si128(filter2, t1f);
        filter2 = _mm_or_si128(filter2, work_a);
        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi8(filter1, t1);
        work_a = _mm_cmpgt_epi8(zero, filt);
        filt = _mm_srli_epi16(filt, 1);
        work_a = _mm_and_si128(work_a, t80);
        filt = _mm_and_si128(filt, t7f);
        filt = _mm_or_si128(filt, work_a);
        filt = _mm_andnot_si128(hev, filt);
        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
        // loopfilter done

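        // flat / flat2 masks for the 16 pixels.  Note the third `work`
        // (the p4/q4 differences) is deliberately not folded into flat; it is
        // carried into the flat2 computation below, which also pulls in the
        // p5..p7 / q5..q7 rows.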
        {
            __m128i work;
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 6 * p)));
            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 5 * p)));
            p5 = _mm256_castsi256_si128(p256_5);
            q5 = _mm256_castsi256_si128(q256_5);
            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

            flat2 = _mm_max_epu8(work, flat2);
            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 7 * p)));
            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 6 * p)));
            p6 = _mm256_castsi256_si128(p256_6);
            q6 = _mm256_castsi256_si128(q256_6);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

            flat2 = _mm_max_epu8(work, flat2);

            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s - 8 * p)));
            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
                                        (__m128d const *)(s + 7 * p)));
            p7 = _mm256_castsi256_si128(p256_7);
            q7 = _mm256_castsi256_si128(q256_7);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
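        // Same running-sum scheme as the 8-pixel version, but on 256-bit
        // registers holding all 16 widened pixels.  After each
        // _mm256_packus_epi16(res, res), qword 0 holds results 0-7 and
        // qword 2 holds results 8-15; _mm256_permute4x64_epi64 with
        // 168 == 0b10101000 (qwords 0,2,2,2) gathers them into the low
        // 128 bits, which _mm256_castsi256_si128 then extracts.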
        {
            const __m256i eight = _mm256_set1_epi16(8);
            const __m256i four = _mm256_set1_epi16(4);
            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                    res_q;

            const __m256i filter = _mm256_load_si256(
                                  (__m256i const *)filt_loopfilter_avx2);
            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
            q256_7 = _mm256_shuffle_epi8(q256_7, filter);

            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                    _mm256_add_epi16(p256_4, p256_3));
            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                    _mm256_add_epi16(q256_4, q256_3));

            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
                    _mm256_add_epi16(p256_2, p256_1));
            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
                    _mm256_add_epi16(q256_2, q256_1));
            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

            pixelFilter_p = _mm256_add_epi16(eight,
                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(p256_7, p256_0)), 4);

            flat2_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(q256_7, q256_0)), 4);

            flat2_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(p256_3, p256_0)), 3);

            flat_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(q256_3, q256_0)), 3);

            flat_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(p256_7, p256_7);

            sum_q7 = _mm256_add_epi16(q256_7, q256_7);

            sum_p3 = _mm256_add_epi16(p256_3, p256_3);

            sum_q3 = _mm256_add_epi16(q256_3, q256_3);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_1)), 4);

            flat2_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_1)), 4);

            flat2_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_1)), 3);

            flat_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_1)), 3);

            flat_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_2)), 4);

            flat2_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_2)), 4);

            flat2_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_2)), 3);

            flat_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_2)), 3);

            flat_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_3)), 4);

            flat2_p3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_3)), 4);

            flat2_q3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_4)), 4);

            flat2_p4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_4)), 4);

            flat2_q4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_5)), 4);

            flat2_p5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_5)), 4);

            flat2_q5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_6)), 4);

            flat2_p6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_6)), 4);

            flat2_q6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));
        }

        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

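        // Blend and store: p1/p0/q0/q1 start from the 4-tap results
        // (ps1/ps0/qs0/qs1), flat selects the 7-tap results for p2..q2, and
        // flat2 selects the 15-tap results for p6..q6.  Full 16-byte rows are
        // written back with unaligned stores.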
        p2 = _mm_andnot_si128(flat, p2);
        flat_p2 = _mm_and_si128(flat, flat_p2);
        p2 = _mm_or_si128(flat_p2, p2);

        p1 = _mm_andnot_si128(flat, ps1);
        flat_p1 = _mm_and_si128(flat, flat_p1);
        p1 = _mm_or_si128(flat_p1, p1);

        p0 = _mm_andnot_si128(flat, ps0);
        flat_p0 = _mm_and_si128(flat, flat_p0);
        p0 = _mm_or_si128(flat_p0, p0);

        q0 = _mm_andnot_si128(flat, qs0);
        flat_q0 = _mm_and_si128(flat, flat_q0);
        q0 = _mm_or_si128(flat_q0, q0);

        q1 = _mm_andnot_si128(flat, qs1);
        flat_q1 = _mm_and_si128(flat, flat_q1);
        q1 = _mm_or_si128(flat_q1, q1);

        q2 = _mm_andnot_si128(flat, q2);
        flat_q2 = _mm_and_si128(flat, flat_q2);
        q2 = _mm_or_si128(flat_q2, q2);

        p6 = _mm_andnot_si128(flat2, p6);
        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
        p6 = _mm_or_si128(flat2_p6, p6);
        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);

        p5 = _mm_andnot_si128(flat2, p5);
        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
        p5 = _mm_or_si128(flat2_p5, p5);
        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);

        p4 = _mm_andnot_si128(flat2, p4);
        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
        p4 = _mm_or_si128(flat2_p4, p4);
        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);

        p3 = _mm_andnot_si128(flat2, p3);
        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
        p3 = _mm_or_si128(flat2_p3, p3);
        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);

        p2 = _mm_andnot_si128(flat2, p2);
        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
        p2 = _mm_or_si128(flat2_p2, p2);
        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);

        p1 = _mm_andnot_si128(flat2, p1);
        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
        p1 = _mm_or_si128(flat2_p1, p1);
        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);

        p0 = _mm_andnot_si128(flat2, p0);
        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
        p0 = _mm_or_si128(flat2_p0, p0);
        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);

        q0 = _mm_andnot_si128(flat2, q0);
        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
        q0 = _mm_or_si128(flat2_q0, q0);
        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);

        q1 = _mm_andnot_si128(flat2, q1);
        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
        q1 = _mm_or_si128(flat2_q1, q1);
        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);

        q2 = _mm_andnot_si128(flat2, q2);
        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
        q2 = _mm_or_si128(flat2_q2, q2);
        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);

        q3 = _mm_andnot_si128(flat2, q3);
        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
        q3 = _mm_or_si128(flat2_q3, q3);
        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);

        q4 = _mm_andnot_si128(flat2, q4);
        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
        q4 = _mm_or_si128(flat2_q4, q4);
        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);

        q5 = _mm_andnot_si128(flat2, q5);
        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
        q5 = _mm_or_si128(flat2_q5, q5);
        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);

        q6 = _mm_andnot_si128(flat2, q6);
        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
        q6 = _mm_or_si128(flat2_q6, q6);
        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
    }
}

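// count == 1 filters an 8-pixel-wide segment of the edge; any other count
// filters the full 16 pixels.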
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh, int count) {
    if (count == 1)
        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
    else
        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
}