15ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang/*
25ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
35ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *
45ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  Use of this source code is governed by a BSD-style license
55ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  that can be found in the LICENSE file in the root of the source
65ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  tree. An additional intellectual property rights grant can be found
75ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  in the file PATENTS.  All contributing project authors may
85ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang *  be found in the AUTHORS file in the root of the source tree.
95ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#include <immintrin.h>  /* AVX2 */
125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_ports/mem.h"
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const unsigned char *_blimit, const unsigned char *_limit,
185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const unsigned char *_thresh) {
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i mask, hev, flat, flat2;
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i zero = _mm_set1_epi16(0);
215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i one = _mm_set1_epi8(1);
225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i abs_p1p0;
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i thresh = _mm_broadcastb_epi8(
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _thresh[0]));
275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i limit = _mm_broadcastb_epi8(
285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _limit[0]));
295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i blimit = _mm_broadcastb_epi8(
305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _blimit[0]));
315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q4p4 = _mm_castps_si128(
345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q3p3 = _mm_castps_si128(
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q2p2 = _mm_castps_si128(
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q1p1 = _mm_castps_si128(
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    p1q1 = _mm_shuffle_epi32(q1p1, 78);
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    q0p0 = _mm_castps_si128(
475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    p0q0 = _mm_shuffle_epi32(q0p0, 78);
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    {
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(q0p0, q1p1));
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        fe = _mm_set1_epi8(0xfe);
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(p0q0, q0p0));
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(p1q1, q1p1));
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        hev = _mm_subs_epu8(flat, thresh);
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(abs_p1p0, mask);
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(p1 - p0) > limit) * -1;
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(q1 - q0) > limit) * -1;
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work = _mm_max_epu8(
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        _mm_subs_epu8(q1p1, q2p2)),
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                        _mm_subs_epu8(q2p2, q3p3)));
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(work, mask);
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_subs_epu8(mask, limit);
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_cmpeq_epi8(mask, zero);
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // lp filter
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    {
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t4 = _mm_set1_epi8(4);
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t3 = _mm_set1_epi8(3);
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t80 = _mm_set1_epi8(0x80);
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t1 = _mm_set1_epi16(0x1);
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs0 = _mm_xor_si128(p0q0, t80);
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs1 = _mm_xor_si128(p1q1, t80);
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i filt;
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i work_a;
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i filter1, filter2;
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_subs_epi8(qs0, qs0ps0);
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
1067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_and_si128(filt, mask);
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_adds_epi8(filt, t4);
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_adds_epi8(filt, t3);
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_unpacklo_epi8(zero, filter1);
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_srai_epi16(filter1, 0xB);
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_unpacklo_epi8(zero, filter2);
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_srai_epi16(filter2, 0xB);
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* Filter1 >> 3 */
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* filt >> 1 */
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi16(filter1, t1);
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_srai_epi16(filt, 1);
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_andnot_si128(
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // loopfilter done
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        {
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i work;
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_max_epu8(
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q2p2)),
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q3p3)));
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_max_epu8(abs_p1p0, flat);
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_subs_epu8(flat, one);
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_cmpeq_epi8(flat, zero);
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_and_si128(flat, mask);
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q5p5 = _mm_castps_si128(
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            (__m64 *) (s + 5 * p)));
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q6p6 = _mm_castps_si128(
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            (__m64 *) (s + 6 * p)));
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q4p4)),
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q5p5)));
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
1605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q7p7 = _mm_castps_si128(
1615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
1625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            (__m64 *) (s + 7 * p)));
1635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
1655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
1665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q6p6)),
1675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
1685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_subs_epu8(q0p0, q7p7)));
1695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(work, flat2);
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_subs_epu8(flat2, one);
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_cmpeq_epi8(flat2, zero);
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // flat and wide flat calculations
1795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        {
1805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            const __m128i eight = _mm_set1_epi16(8);
1815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            const __m128i four = _mm_set1_epi16(4);
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
1835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
1845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i pixelFilter_p, pixelFilter_q;
1855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
1865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
1875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
1895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
1905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
1915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
1925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
2035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            q7_16 = _mm_unpackhi_epi8(q7p7, zero);
2045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(p4_16, p3_16));
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(q4_16, q3_16));
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(p2_16, p1_16));
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(q2_16, q1_16));
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_add_epi16(eight,
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm_add_epi16(four,
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_p2p1p0,
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(p3_16, p0_16)), 3);
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_p2p1p0,
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(q3_16, q0_16)), 3);
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q0p0 = _mm_packus_epi16(res_p, res_q);
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(p7_16, p7_16);
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(q7_16, q7_16);
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p3 = _mm_add_epi16(p3_16, p3_16);
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q3 = _mm_add_epi16(q3_16, q3_16);
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_p2p1p0,
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(sum_p3, p1_16)), 3);
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_q2q1q0,
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(sum_q3, q1_16)), 3);
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q1p1 = _mm_packus_epi16(res_p, res_q);
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q3 = _mm_add_epi16(sum_q3, q3_16);
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_p2p1p0,
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(sum_p3, p2_16)), 3);
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixetFilter_q2q1q0,
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm_add_epi16(sum_q3, q2_16)), 3);
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q2p2 = _mm_packus_epi16(res_p, res_q);
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
3175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
3185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
3205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
3215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
3235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
3255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
3265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm_srli_epi16(
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm_srli_epi16(
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    4);
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
3355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // wide flat
3375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat = _mm_shuffle_epi32(flat, 68);
3405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2 = _mm_shuffle_epi32(flat2, 68);
3415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2p2 = _mm_andnot_si128(flat, q2p2);
3435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
3445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2p2 = _mm_or_si128(q2p2, flat_q2p2);
3455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
3485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
3495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
3515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
3525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
3535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q6p6 = _mm_andnot_si128(flat2, q6p6);
3555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
3565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
3575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
3585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
3595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q5p5 = _mm_andnot_si128(flat2, q5p5);
3615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
3625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
3635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
3645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
3655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q4p4 = _mm_andnot_si128(flat2, q4p4);
3675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
3685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
3695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
3705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
3715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q3p3 = _mm_andnot_si128(flat2, q3p3);
3735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2p2 = _mm_andnot_si128(flat2, q2p2);
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
3805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
3815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
3825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1p1 = _mm_andnot_si128(flat2, q1p1);
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
3895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0p0 = _mm_andnot_si128(flat2, q0p0);
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
3925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
3945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
3955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
3965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh VenkatasubramanianDECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
3997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
4007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
4017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian};
4027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuangstatic void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
4045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const unsigned char *_blimit, const unsigned char *_limit,
4055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const unsigned char *_thresh) {
4065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i mask, hev, flat, flat2;
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i zero = _mm_set1_epi16(0);
4085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i one = _mm_set1_epi8(1);
4095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i p7, p6, p5;
4105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
4115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    __m128i q5, q6, q7;
4127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
4137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
4147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_0, q256_0;
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i thresh = _mm_broadcastb_epi8(
4175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _thresh[0]));
4185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i limit = _mm_broadcastb_epi8(
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _limit[0]));
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    const __m128i blimit = _mm_broadcastb_epi8(
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            _mm_cvtsi32_si128((int) _blimit[0]));
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
4247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 5 * p)));
4257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
4267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 4 * p)));
4277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
4287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 3 * p)));
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
4307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 2 * p)));
4317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
4327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 1 * p)));
4337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
4347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s - 0 * p)));
4357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
4367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s + 1 * p)));
4377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
4387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s + 2 * p)));
4397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
4407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s + 3 * p)));
4417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
4427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                (__m128d const *)(s + 4 * p)));
4437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p4 = _mm256_castsi256_si128(p256_4);
4457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p3 = _mm256_castsi256_si128(p256_3);
4467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p2 = _mm256_castsi256_si128(p256_2);
4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p1 = _mm256_castsi256_si128(p256_1);
4487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    p0 = _mm256_castsi256_si128(p256_0);
4497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q0 = _mm256_castsi256_si128(q256_0);
4507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q1 = _mm256_castsi256_si128(q256_1);
4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q2 = _mm256_castsi256_si128(q256_2);
4527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q3 = _mm256_castsi256_si128(q256_3);
4537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    q4 = _mm256_castsi256_si128(q256_4);
4545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    {
4565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(p0, p1));
4585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
4595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(q0, q1));
4605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i fe = _mm_set1_epi8(0xfe);
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(q0, p0));
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_subs_epu8(q1, p1));
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i work;
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
4685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        hev = _mm_subs_epu8(flat, thresh);
4695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
4705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
4735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(flat, mask);
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(p1 - p0) > limit) * -1;
4785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // mask |= (abs(q1 - q0) > limit) * -1;
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work = _mm_max_epu8(
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
4815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
4825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(work, mask);
4835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work = _mm_max_epu8(
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
4865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_max_epu8(work, mask);
4875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_subs_epu8(mask, limit);
4885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        mask = _mm_cmpeq_epi8(mask, zero);
4895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    // lp filter
4925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    {
4935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t4 = _mm_set1_epi8(4);
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t3 = _mm_set1_epi8(3);
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t80 = _mm_set1_epi8(0x80);
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i te0 = _mm_set1_epi8(0xe0);
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t1f = _mm_set1_epi8(0x1f);
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t1 = _mm_set1_epi8(0x1);
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        const __m128i t7f = _mm_set1_epi8(0x7f);
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i ps1 = _mm_xor_si128(p1, t80);
5025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i ps0 = _mm_xor_si128(p0, t80);
5035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs0 = _mm_xor_si128(q0, t80);
5045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i qs1 = _mm_xor_si128(q1, t80);
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i filt;
5065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i work_a;
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i filter1, filter2;
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
5115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                flat_q2;
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
5145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_subs_epi8(qs0, ps0);
5155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
5165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filt, work_a);
5187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian        /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
5195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_and_si128(filt, mask);
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_adds_epi8(filt, t4);
5225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_adds_epi8(filt, t3);
5235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* Filter1 >> 3 */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_cmpgt_epi8(zero, filter1);
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_srli_epi16(filter1, 3);
5275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_and_si128(work_a, te0);
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_and_si128(filter1, t1f);
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter1 = _mm_or_si128(filter1, work_a);
5305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
5315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* Filter2 >> 3 */
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_cmpgt_epi8(zero, filter2);
5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_srli_epi16(filter2, 3);
5355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_and_si128(work_a, te0);
5365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_and_si128(filter2, t1f);
5375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filter2 = _mm_or_si128(filter2, work_a);
5385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        /* filt >> 1 */
5415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_adds_epi8(filter1, t1);
5425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_cmpgt_epi8(zero, filt);
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_srli_epi16(filt, 1);
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        work_a = _mm_and_si128(work_a, t80);
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_and_si128(filt, t7f);
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_or_si128(filt, work_a);
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        filt = _mm_andnot_si128(hev, filt);
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
5495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
5505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // loopfilter done
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        {
5535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m128i work;
5545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
5555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
5565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
5575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_max_epu8(work, flat);
5585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
5605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_max_epu8(work, flat);
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
5655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_subs_epu8(flat, one);
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_cmpeq_epi8(flat, zero);
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat = _mm_and_si128(flat, mask);
5685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
5707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s - 6 * p)));
5717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
5727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s + 5 * p)));
5737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p5 = _mm256_castsi256_si128(p256_5);
5747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q5 = _mm256_castsi256_si128(q256_5);
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(
5765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
5775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
5785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(work, flat2);
5807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
5817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s - 7 * p)));
5827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
5837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s + 6 * p)));
5847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p6 = _mm256_castsi256_si128(p256_6);
5857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q6 = _mm256_castsi256_si128(q256_6);
5865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
5895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(work, flat2);
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
5937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s - 8 * p)));
5947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
5957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                        (__m128d const *)(s + 7 * p)));
5967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p7 = _mm256_castsi256_si128(p256_7);
5977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q7 = _mm256_castsi256_si128(q256_7);
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            work = _mm_max_epu8(
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
6005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
6015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_max_epu8(work, flat2);
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_subs_epu8(flat2, one);
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_cmpeq_epi8(flat2, zero);
6055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
6095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // flat and wide flat calculations
6105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        {
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            const __m256i eight = _mm256_set1_epi16(8);
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            const __m256i four = _mm256_set1_epi16(4);
6135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    res_q;
6165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            const __m256i filter = _mm256_load_si256(
6187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                  (__m256i const *)filt_loopfilter_avx2);
6197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_7 = _mm256_shuffle_epi8(p256_7, filter);
6207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_6 = _mm256_shuffle_epi8(p256_6, filter);
6217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_5 = _mm256_shuffle_epi8(p256_5, filter);
6227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_4 = _mm256_shuffle_epi8(p256_4, filter);
6237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_3 = _mm256_shuffle_epi8(p256_3, filter);
6247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_2 = _mm256_shuffle_epi8(p256_2, filter);
6257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_1 = _mm256_shuffle_epi8(p256_1, filter);
6267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            p256_0 = _mm256_shuffle_epi8(p256_0, filter);
6277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_0 = _mm256_shuffle_epi8(q256_0, filter);
6287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_1 = _mm256_shuffle_epi8(q256_1, filter);
6297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_2 = _mm256_shuffle_epi8(q256_2, filter);
6307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_3 = _mm256_shuffle_epi8(q256_3, filter);
6317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_4 = _mm256_shuffle_epi8(q256_4, filter);
6327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_5 = _mm256_shuffle_epi8(q256_5, filter);
6337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_6 = _mm256_shuffle_epi8(q256_6, filter);
6347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian            q256_7 = _mm256_shuffle_epi8(q256_7, filter);
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(p256_4, p256_3));
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(q256_4, q256_3));
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(p256_2, p256_1));
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(q256_2, q256_1));
6475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
6485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_add_epi16(eight,
6505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
6515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
6535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
6545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
6565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
6575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(p256_7, p256_0)), 4);
6585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p0 = _mm256_castsi256_si128(
6605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
6615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
6625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
6645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
6655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(q256_7, q256_0)), 4);
6665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q0 = _mm256_castsi256_si128(
6685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
6695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
6705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
6725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_p2p1p0,
6735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(p256_3, p256_0)), 3);
6745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_p0 = _mm256_castsi256_si128(
6765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
6775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
6785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
6805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_p2p1p0,
6815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(q256_3, q256_0)), 3);
6825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q0 = _mm256_castsi256_si128(
6845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
6855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
6865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
6885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
6905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
6925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q3 = _mm256_add_epi16(q256_3, q256_3);
6945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
6965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
6985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
7005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
7015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_1)), 4);
7025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p1 = _mm256_castsi256_si128(
7045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
7055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
7085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
7095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_1)), 4);
7105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q1 = _mm256_castsi256_si128(
7125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
7135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
7165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
7185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
7205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_p2p1p0,
7215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p3, p256_1)), 3);
7225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_p1 = _mm256_castsi256_si128(
7245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
7255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
7285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_q2q1q0,
7295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q3, q256_1)), 3);
7305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q1 = _mm256_castsi256_si128(
7325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
7335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
7365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
7385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
7405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
7425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
7445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
7465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
7485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
7495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_2)), 4);
7505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p2 = _mm256_castsi256_si128(
7525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
7535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
7565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
7575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_2)), 4);
7585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q2 = _mm256_castsi256_si128(
7605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
7615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
7645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
7665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
7685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_p2p1p0,
7695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p3, p256_2)), 3);
7705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_p2 = _mm256_castsi256_si128(
7725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
7735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
7765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixetFilter_q2q1q0,
7775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q3, q256_2)), 3);
7785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat_q2 = _mm256_castsi256_si128(
7805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
7815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
7845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
7865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
7885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
7905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
7925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
7935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_3)), 4);
7945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p3 = _mm256_castsi256_si128(
7965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
7975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
7985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
7995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
8005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
8015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_3)), 4);
8025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q3 = _mm256_castsi256_si128(
8045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
8055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
8085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
8105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
8125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
8145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
8165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
8175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_4)), 4);
8185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p4 = _mm256_castsi256_si128(
8205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
8215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
8245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
8255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_4)), 4);
8265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q4 = _mm256_castsi256_si128(
8285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
8295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
8325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
8345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
8365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
8385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
8405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
8415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_5)), 4);
8425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p5 = _mm256_castsi256_si128(
8445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
8455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
8485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
8495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_5)), 4);
8505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q5 = _mm256_castsi256_si128(
8525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
8535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
8565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
8585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
8605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
8625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_p = _mm256_srli_epi16(
8645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_p,
8655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_p7, p256_6)), 4);
8665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_p6 = _mm256_castsi256_si128(
8685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
8695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            res_q = _mm256_srli_epi16(
8725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_add_epi16(pixelFilter_q,
8735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            _mm256_add_epi16(sum_q7, q256_6)), 4);
8745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            flat2_q6 = _mm256_castsi256_si128(
8765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
8775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            168));
8785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
8795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // wide flat
8815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
8825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p2 = _mm_andnot_si128(flat, p2);
8845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_p2 = _mm_and_si128(flat, flat_p2);
8855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p2 = _mm_or_si128(flat_p2, p2);
8865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p1 = _mm_andnot_si128(flat, ps1);
8885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_p1 = _mm_and_si128(flat, flat_p1);
8895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p1 = _mm_or_si128(flat_p1, p1);
8905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p0 = _mm_andnot_si128(flat, ps0);
8925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_p0 = _mm_and_si128(flat, flat_p0);
8935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p0 = _mm_or_si128(flat_p0, p0);
8945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0 = _mm_andnot_si128(flat, qs0);
8965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q0 = _mm_and_si128(flat, flat_q0);
8975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0 = _mm_or_si128(flat_q0, q0);
8985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
8995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1 = _mm_andnot_si128(flat, qs1);
9005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q1 = _mm_and_si128(flat, flat_q1);
9015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1 = _mm_or_si128(flat_q1, q1);
9025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2 = _mm_andnot_si128(flat, q2);
9045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat_q2 = _mm_and_si128(flat, flat_q2);
9055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2 = _mm_or_si128(flat_q2, q2);
9065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p6 = _mm_andnot_si128(flat2, p6);
9085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
9095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p6 = _mm_or_si128(flat2_p6, p6);
9105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
9115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p5 = _mm_andnot_si128(flat2, p5);
9135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
9145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p5 = _mm_or_si128(flat2_p5, p5);
9155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
9165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p4 = _mm_andnot_si128(flat2, p4);
9185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
9195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p4 = _mm_or_si128(flat2_p4, p4);
9205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
9215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p3 = _mm_andnot_si128(flat2, p3);
9235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
9245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p3 = _mm_or_si128(flat2_p3, p3);
9255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
9265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p2 = _mm_andnot_si128(flat2, p2);
9285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
9295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p2 = _mm_or_si128(flat2_p2, p2);
9305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
9315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p1 = _mm_andnot_si128(flat2, p1);
9335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
9345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p1 = _mm_or_si128(flat2_p1, p1);
9355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
9365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p0 = _mm_andnot_si128(flat2, p0);
9385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
9395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        p0 = _mm_or_si128(flat2_p0, p0);
9405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
9415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0 = _mm_andnot_si128(flat2, q0);
9435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
9445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q0 = _mm_or_si128(flat2_q0, q0);
9455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
9465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1 = _mm_andnot_si128(flat2, q1);
9485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
9495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q1 = _mm_or_si128(flat2_q1, q1);
9505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
9515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2 = _mm_andnot_si128(flat2, q2);
9535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
9545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q2 = _mm_or_si128(flat2_q2, q2);
9555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
9565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q3 = _mm_andnot_si128(flat2, q3);
9585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
9595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q3 = _mm_or_si128(flat2_q3, q3);
9605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
9615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q4 = _mm_andnot_si128(flat2, q4);
9635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
9645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q4 = _mm_or_si128(flat2_q4, q4);
9655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
9665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q5 = _mm_andnot_si128(flat2, q5);
9685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
9695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q5 = _mm_or_si128(flat2_q5, q5);
9705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
9715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
9725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q6 = _mm_andnot_si128(flat2, q6);
9735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
9745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        q6 = _mm_or_si128(flat2_q6, q6);
9755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
9765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
9775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
9785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
/*
 * Entry point that selects between the two horizontal-edge worker routines
 * in this file and forwards s, p, _blimit, _limit and _thresh unchanged.
 *
 *   count == 1      -> mb_lpf_horizontal_edge_w_avx2_8
 *   any other value -> mb_lpf_horizontal_edge_w_avx2_16
 *
 * NOTE(review): count presumably expresses the edge width in 8-pixel units;
 * confirm against the callers.
 */
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh, int count) {
    if (count != 1) {
        /* Every value other than 1 takes the 16-wide worker. */
        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
        return;
    }

    mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
}
987