/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <emmintrin.h>  // SSE2
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_loopfilter.h"
13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/emmintrin_compat.h"
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Filters 8 pixel columns across a horizontal edge with the 4-tap, 8-tap and
// 15-tap ("wide") loop filters, selecting the result per column.
//
// s        points at the first row below the edge (q0). Rows above the edge
//          are p0 = s - 1 * p ... p7 = s - 8 * p; rows below are
//          q0 = s ... q7 = s + 7 * p. 8 bytes are read from each of these 16
//          rows; rows p6..q6 are written back (p7 and q7 are only read).
// p        distance in bytes between consecutive rows.
// _blimit, _limit, _thresh
//          each is read as one 16-byte vector with an aligned load, so the
//          pointers must be 16-byte aligned and reference 16 readable bytes.
//
// Register naming: "qXpX" holds row pX in the low 64 bits and row qX in the
// high 64 bits, so one instruction processes both sides of the edge.
static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
                                            int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  // Load rows p4..q4: p row into the low half, q row into the high half.
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
                                       (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
                                       (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
                                       (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
                                       (__m64 *)(s + 1 * p)));
  // Shuffle immediate 78 (0x4E) swaps the two 64-bit halves.
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
                                       (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  // Filter mask and high-edge-variance (hev) flag.
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    // Unsigned |a - b| is built as (a -sat b) | (b -sat a).
    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                            _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all ones
    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                            _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                            _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev = (max(|p1 - p0|, |q1 - q0|) > thresh) ? 0xff : 0
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    // Clearing the low bit first keeps the 16-bit shift from leaking a bit
    // into the neighbouring byte.
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    // mask = (|p0 - q0| * 2 + |p1 - q1| / 2 > blimit) ? 0xff : 0
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // Fold in the neighbouring-row differences; the running maximum is
    // compared against limit in one step below.
    mask = _mm_max_epu8(abs_p1p0, mask);

    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                                     _mm_subs_epu8(q1p1, q2p2)),
                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                                     _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    // Combine the p-side (low half) and q-side (high half) maxima.
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    // mask = 0xff where every difference is <= limit (filtering allowed).
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    // XOR with 0x80 maps unsigned pixels to the signed range.
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    // 4-tap filter. Only the low 8 bytes of filt are consumed below:
    // filt = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), with signed saturation.
    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vp9_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Place each byte in the high byte of a 16-bit lane, then shift
    // arithmetically by 8 + 3: a sign-extended ">> 3" of the 8-bit value.
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // ps0 += filter2 (low half), qs0 -= filter1 (high half).
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt = ((filter1 + 1) >> 1) & ~hev; ps1 += filt, qs1 -= filt.
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    // Flatness flags.
    {
      __m128i work;
      // flat: rows 1..3 on both sides are within 1 of row 0 on their side.
      flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                                       _mm_subs_epu8(q0p0, q2p2)),
                          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                                       _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
                                           (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
                                           (__m64 *)(s + 6 * p)));

      // flat2: rows 4..7 on both sides are within 1 of row 0 on their side.
      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                                        _mm_subs_epu8(q0p0, q4p4)),
                           _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                                        _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
                                           (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                                       _mm_subs_epu8(q0p0, q6p6)),
                          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                                       _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    //
    // Both filters are evaluated as running sums in 16-bit lanes:
    // pixelFilter_p/_q feed the 15-tap outputs (rounding 8, shift 4) and
    // pixelFilter_p2p1p0/_q2q1q0 feed the 8-tap outputs (rounding 4,
    // shift 3). Each step drops one far-side pixel from the sum and adds one
    // more copy of the outermost pixel (p7/q7 or p3/q3) via sum_*.
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixelFilter_p2p1p0, pixelFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      // Zero-extend every row to 16 bits (p rows from the low half, q rows
      // from the high half).
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixelFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixelFilter_p2p1p0);

      pixelFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixelFilter_q2q1q0);
      // pixelFilter_p = 8 + (p6 + ... + p0) + (q0 + ... + q6)
      pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
                                                         pixelFilter_q));
      // pixelFilter_p2p1p0 = 4 + (p2 + p1 + p0) + (q0 + q1 + q2)
      pixelFilter_p2p1p0 = _mm_add_epi16(four,
                                         _mm_add_epi16(pixelFilter_p2p1p0,
                                                       pixelFilter_q2q1q0));
      // Row 0, 15-tap.
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      // Row 0, 8-tap.
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      // Row 1, 15-tap. From here on the p and q sums diverge.
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      // Row 1, 8-tap.
      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_p2p1p0, p2_16);
      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      // Row 2, 15-tap.
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      // Row 2, 8-tap.
      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q1_16);
      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p2p1p0,
                                           _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q2q1q0,
                                           _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      // Row 3, 15-tap.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      // Row 4, 15-tap.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      // Row 5, 15-tap.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      // Row 6, 15-tap.
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
                                           _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
                                           _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    // Shuffle immediate 68 (0x44) copies the low 64 bits into both halves so
    // the same per-column flag selects the p row and the q row.
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    // First selection: 8-tap result where flat, otherwise the 4-tap result
    // (rows 0 and 1) or the untouched input (row 2).
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    // Second selection: 15-tap result where flat2, otherwise the value from
    // the first selection (rows 0..2) or the untouched input (rows 3..6).
    // Each row pair is stored as soon as it is final.
    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
366233d2500723e5594f3e7c70896ffeeef32b9c950ywan
367233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
368233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                             int p,
369233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                             const unsigned char *_blimit,
370233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                             const unsigned char *_limit,
371233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                             const unsigned char *_thresh) {
372233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
373233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
374233d2500723e5594f3e7c70896ffeeef32b9c950ywan
375233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
376233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
377233d2500723e5594f3e7c70896ffeeef32b9c950ywan
378233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
379233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
380233d2500723e5594f3e7c70896ffeeef32b9c950ywan
381233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i zero = _mm_set1_epi16(0);
382233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i one = _mm_set1_epi8(1);
383233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
384233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
385233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
386233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i mask, hev, flat, flat2;
387233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i p7, p6, p5;
388233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
389233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i q5, q6, q7;
390233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int i = 0;
391233d2500723e5594f3e7c70896ffeeef32b9c950ywan
392233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
393233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
394233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
395233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
396233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
397233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
398233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
399233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
400233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
401233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
402233d2500723e5594f3e7c70896ffeeef32b9c950ywan
403233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&ap[4 * 16], p4);
404233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&ap[3 * 16], p3);
405233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&ap[2 * 16], p2);
406233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&ap[1 * 16], p1);
407233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&ap[0 * 16], p0);
408233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&aq[4 * 16], q4);
409233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&aq[3 * 16], q3);
410233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&aq[2 * 16], q2);
411233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&aq[1 * 16], q1);
412233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_store_si128((__m128i *)&aq[0 * 16], q0);
413233d2500723e5594f3e7c70896ffeeef32b9c950ywan
414233d2500723e5594f3e7c70896ffeeef32b9c950ywan
415233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
417233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(p0, p1));
418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
419233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(q0, q1));
420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i fe = _mm_set1_epi8(0xfe);
421233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
423233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q0, p0));
424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
425233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q1, p1));
426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work;
427233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_subs_epu8(flat, thresh);
429233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
431233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
433233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
436233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(flat, mask);
437233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p1 - p0) > limit) * -1;
438233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(q1 - q0) > limit) * -1;
439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
440233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(p1, p2)),
441233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(p3, p2),
442233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(p2, p3)));
443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
444233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
445233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q1, q2)),
446233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(q3, q2),
447233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(q2, q3)));
448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
449233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(mask, limit);
450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_cmpeq_epi8(mask, zero);
451233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
452233d2500723e5594f3e7c70896ffeeef32b9c950ywan
453233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // lp filter
454233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t4 = _mm_set1_epi8(4);
456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t3 = _mm_set1_epi8(3);
457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t80 = _mm_set1_epi8(0x80);
458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i te0 = _mm_set1_epi8(0xe0);
459233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1f = _mm_set1_epi8(0x1f);
460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1 = _mm_set1_epi8(0x1);
461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t7f = _mm_set1_epi8(0x7f);
462233d2500723e5594f3e7c70896ffeeef32b9c950ywan
463233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i ps1 = _mm_xor_si128(p1, t80);
464233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i ps0 = _mm_xor_si128(p0, t80);
465233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i qs0 = _mm_xor_si128(q0, t80);
466233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i qs1 = _mm_xor_si128(q1, t80);
467233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filt;
468233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work_a;
469233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filter1, filter2;
470233d2500723e5594f3e7c70896ffeeef32b9c950ywan
471233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
472233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_subs_epi8(qs0, ps0);
473233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
474233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
475233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
476233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // (vp9_filter + 3 * (qs0 - ps0)) & mask
477233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, mask);
478233d2500723e5594f3e7c70896ffeeef32b9c950ywan
479233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_adds_epi8(filt, t4);
480233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_adds_epi8(filt, t3);
481233d2500723e5594f3e7c70896ffeeef32b9c950ywan
482233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter1 >> 3
483233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter1);
484233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_srli_epi16(filter1, 3);
485233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
486233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_and_si128(filter1, t1f);
487233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_or_si128(filter1, work_a);
488233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
489233d2500723e5594f3e7c70896ffeeef32b9c950ywan
490233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter2 >> 3
491233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter2);
492233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_srli_epi16(filter2, 3);
493233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
494233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_and_si128(filter2, t1f);
495233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_or_si128(filter2, work_a);
496233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
497233d2500723e5594f3e7c70896ffeeef32b9c950ywan
498233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filt >> 1
499233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filter1, t1);
500233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filt);
501233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_srli_epi16(filt, 1);
502233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, t80);
503233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, t7f);
504233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_or_si128(filt, work_a);
505233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_andnot_si128(hev, filt);
506233d2500723e5594f3e7c70896ffeeef32b9c950ywan    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
507233d2500723e5594f3e7c70896ffeeef32b9c950ywan    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
508233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // loopfilter done
509233d2500723e5594f3e7c70896ffeeef32b9c950ywan
510233d2500723e5594f3e7c70896ffeeef32b9c950ywan    {
511233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __m128i work;
512233d2500723e5594f3e7c70896ffeeef32b9c950ywan      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
513233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p2)),
514233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q2, q0),
515233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q2)));
516233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat = _mm_max_epu8(work, flat);
517233d2500723e5594f3e7c70896ffeeef32b9c950ywan      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
518233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p3)),
519233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q3, q0),
520233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q3)));
521233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat = _mm_max_epu8(work, flat);
522233d2500723e5594f3e7c70896ffeeef32b9c950ywan      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
523233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p4)),
524233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q4, q0),
525233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q4)));
526233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat = _mm_subs_epu8(flat, one);
527233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat = _mm_cmpeq_epi8(flat, zero);
528233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat = _mm_and_si128(flat, mask);
529233d2500723e5594f3e7c70896ffeeef32b9c950ywan
530233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
531233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
532233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
533233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p5)),
534233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q5, q0),
535233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q5)));
536233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&ap[5 * 16], p5);
537233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&aq[5 * 16], q5);
538233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_max_epu8(work, flat2);
539233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
540233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
541233d2500723e5594f3e7c70896ffeeef32b9c950ywan      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
542233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p6)),
543233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q6, q0),
544233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q6)));
545233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&ap[6 * 16], p6);
546233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&aq[6 * 16], q6);
547233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_max_epu8(work, flat2);
548233d2500723e5594f3e7c70896ffeeef32b9c950ywan
549233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
550233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
551233d2500723e5594f3e7c70896ffeeef32b9c950ywan      work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
552233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                       _mm_subs_epu8(p0, p7)),
553233d2500723e5594f3e7c70896ffeeef32b9c950ywan                           _mm_or_si128(_mm_subs_epu8(q7, q0),
554233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                        _mm_subs_epu8(q0, q7)));
555233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&ap[7 * 16], p7);
556233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_store_si128((__m128i *)&aq[7 * 16], q7);
557233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_max_epu8(work, flat2);
558233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_subs_epu8(flat2, one);
559233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_cmpeq_epi8(flat2, zero);
560233d2500723e5594f3e7c70896ffeeef32b9c950ywan      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
561233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
562233d2500723e5594f3e7c70896ffeeef32b9c950ywan
563233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
564233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // flat and wide flat calculations
565233d2500723e5594f3e7c70896ffeeef32b9c950ywan    {
566233d2500723e5594f3e7c70896ffeeef32b9c950ywan      const __m128i eight = _mm_set1_epi16(8);
567233d2500723e5594f3e7c70896ffeeef32b9c950ywan      const __m128i four = _mm_set1_epi16(4);
568233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __m128i temp_flat2 = flat2;
569233d2500723e5594f3e7c70896ffeeef32b9c950ywan      unsigned char *src = s;
570233d2500723e5594f3e7c70896ffeeef32b9c950ywan      int i = 0;
571233d2500723e5594f3e7c70896ffeeef32b9c950ywan      do {
572233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __m128i workp_shft;
573233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __m128i a, b, c;
574233d2500723e5594f3e7c70896ffeeef32b9c950ywan
575233d2500723e5594f3e7c70896ffeeef32b9c950ywan        unsigned int off = i * 8;
576233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
577233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
578233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
579233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
580233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
581233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
582233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
583233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
584233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
585233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
586233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
587233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
588233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
589233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
590233d2500723e5594f3e7c70896ffeeef32b9c950ywan        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
591233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
592233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
593233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
594233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
595233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
596233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
597233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
598233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
599233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
600233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
601233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
602233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
603233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
604233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
605233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
606233d2500723e5594f3e7c70896ffeeef32b9c950ywan        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
607233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               zero);
608233d2500723e5594f3e7c70896ffeeef32b9c950ywan
609233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
610233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
611233d2500723e5594f3e7c70896ffeeef32b9c950ywan
612233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
613233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
614233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
615233d2500723e5594f3e7c70896ffeeef32b9c950ywan
616233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
617233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
618233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
619233d2500723e5594f3e7c70896ffeeef32b9c950ywan
620233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
621233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
622233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
623233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
624233d2500723e5594f3e7c70896ffeeef32b9c950ywan
625233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q1, a);
626233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
627233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
628233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
629233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
630233d2500723e5594f3e7c70896ffeeef32b9c950ywan
631233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
632233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
633233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
634233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
636233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q2, a);
637233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
638233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_op[i * 8],
639233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
640233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
641233d2500723e5594f3e7c70896ffeeef32b9c950ywan
642233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
643233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
644233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
645233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
646233d2500723e5594f3e7c70896ffeeef32b9c950ywan
647233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q3, a);
648233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
649233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
650233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
651233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
652233d2500723e5594f3e7c70896ffeeef32b9c950ywan
653233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
654233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
655233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
656233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
657233d2500723e5594f3e7c70896ffeeef32b9c950ywan
658233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(q3, b);
659233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
660233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
661233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
662233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
663233d2500723e5594f3e7c70896ffeeef32b9c950ywan
664233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(q4, c);
665233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
666233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
667233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
668233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
669233d2500723e5594f3e7c70896ffeeef32b9c950ywan
670233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(q3, b);
671233d2500723e5594f3e7c70896ffeeef32b9c950ywan        b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
672233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
673233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
674233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          , b));
675233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q5, a);
676233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
677233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
678233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
679233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
680233d2500723e5594f3e7c70896ffeeef32b9c950ywan
681233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q6, a);
682233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
683233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
684233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
685233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
686233d2500723e5594f3e7c70896ffeeef32b9c950ywan
687233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
688233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
689233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
690233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
691233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
692233d2500723e5594f3e7c70896ffeeef32b9c950ywan
693233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
694233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
695233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
696233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
697233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
698233d2500723e5594f3e7c70896ffeeef32b9c950ywan
699233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
700233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
701233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
702233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
703233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
704233d2500723e5594f3e7c70896ffeeef32b9c950ywan
705233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
706233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
707233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
708233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
709233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
710233d2500723e5594f3e7c70896ffeeef32b9c950ywan
711233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
712233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
713233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
714233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
715233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
716233d2500723e5594f3e7c70896ffeeef32b9c950ywan
717233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
718233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
719233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
720233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
721233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
722233d2500723e5594f3e7c70896ffeeef32b9c950ywan
723233d2500723e5594f3e7c70896ffeeef32b9c950ywan        a = _mm_add_epi16(q7, a);
724233d2500723e5594f3e7c70896ffeeef32b9c950ywan        c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
725233d2500723e5594f3e7c70896ffeeef32b9c950ywan        workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
726233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
727233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_packus_epi16(workp_shft, workp_shft));
728233d2500723e5594f3e7c70896ffeeef32b9c950ywan
729233d2500723e5594f3e7c70896ffeeef32b9c950ywan        temp_flat2 = _mm_srli_si128(temp_flat2, 8);
730233d2500723e5594f3e7c70896ffeeef32b9c950ywan        src += 8;
731233d2500723e5594f3e7c70896ffeeef32b9c950ywan      } while (++i < 2);
732233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
733233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // wide flat
734233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
735233d2500723e5594f3e7c70896ffeeef32b9c950ywan
736233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
737233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
738233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
739233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_and_si128(flat, p2);
740233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_or_si128(work_a, p2);
741233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
742233d2500723e5594f3e7c70896ffeeef32b9c950ywan
743233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
744233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, ps1);
745233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_and_si128(flat, p1);
746233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_or_si128(work_a, p1);
747233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
748233d2500723e5594f3e7c70896ffeeef32b9c950ywan
749233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_load_si128((__m128i *)&flat_op[0]);
750233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, ps0);
751233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_and_si128(flat, p0);
752233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_or_si128(work_a, p0);
753233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_op[0], p0);
754233d2500723e5594f3e7c70896ffeeef32b9c950ywan
755233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
756233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, qs0);
757233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_and_si128(flat, q0);
758233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_or_si128(work_a, q0);
759233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_oq[0], q0);
760233d2500723e5594f3e7c70896ffeeef32b9c950ywan
761233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
762233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, qs1);
763233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_and_si128(flat, q1);
764233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_or_si128(work_a, q1);
765233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
766233d2500723e5594f3e7c70896ffeeef32b9c950ywan
767233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
768233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
769233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
770233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_and_si128(flat, q2);
771233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_or_si128(work_a, q2);
772233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
773233d2500723e5594f3e7c70896ffeeef32b9c950ywan
774233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // write out op6 - op3
775233d2500723e5594f3e7c70896ffeeef32b9c950ywan    {
776233d2500723e5594f3e7c70896ffeeef32b9c950ywan      unsigned char *dst = (s - 7 * p);
777233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (i = 6; i > 2; i--) {
778233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __m128i flat2_output;
779233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
780233d2500723e5594f3e7c70896ffeeef32b9c950ywan        flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
781233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_andnot_si128(flat2, work_a);
782233d2500723e5594f3e7c70896ffeeef32b9c950ywan        flat2_output = _mm_and_si128(flat2, flat2_output);
783233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_or_si128(work_a, flat2_output);
784233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storeu_si128((__m128i *)dst, work_a);
785233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += p;
786233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
787233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
788233d2500723e5594f3e7c70896ffeeef32b9c950ywan
789233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
790233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
791233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
792233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_and_si128(flat2, p2);
793233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_or_si128(work_a, p2);
794233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
795233d2500723e5594f3e7c70896ffeeef32b9c950ywan
796233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
797233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
798233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
799233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_and_si128(flat2, p1);
800233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_or_si128(work_a, p1);
801233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
802233d2500723e5594f3e7c70896ffeeef32b9c950ywan
803233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_op[0]);
804233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
805233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
806233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_and_si128(flat2, p0);
807233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_or_si128(work_a, p0);
808233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
809233d2500723e5594f3e7c70896ffeeef32b9c950ywan
810233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
811233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
812233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
813233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_and_si128(flat2, q0);
814233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_or_si128(work_a, q0);
815233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
816233d2500723e5594f3e7c70896ffeeef32b9c950ywan
817233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
818233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
819233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
820233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_and_si128(flat2, q1);
821233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_or_si128(work_a, q1);
822233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
823233d2500723e5594f3e7c70896ffeeef32b9c950ywan
824233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
825233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
826233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat2, work_a);
827233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_and_si128(flat2, q2);
828233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_or_si128(work_a, q2);
829233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
830233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // write out oq3 - oq6
832233d2500723e5594f3e7c70896ffeeef32b9c950ywan    {
833233d2500723e5594f3e7c70896ffeeef32b9c950ywan      unsigned char *dst = (s + 3 * p);
834233d2500723e5594f3e7c70896ffeeef32b9c950ywan      for (i = 3; i < 7; i++) {
835233d2500723e5594f3e7c70896ffeeef32b9c950ywan        __m128i flat2_output;
836233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
837233d2500723e5594f3e7c70896ffeeef32b9c950ywan        flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
838233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_andnot_si128(flat2, work_a);
839233d2500723e5594f3e7c70896ffeeef32b9c950ywan        flat2_output = _mm_and_si128(flat2, flat2_output);
840233d2500723e5594f3e7c70896ffeeef32b9c950ywan        work_a = _mm_or_si128(work_a, flat2_output);
841233d2500723e5594f3e7c70896ffeeef32b9c950ywan        _mm_storeu_si128((__m128i *)dst, work_a);
842233d2500723e5594f3e7c70896ffeeef32b9c950ywan        dst += p;
843233d2500723e5594f3e7c70896ffeeef32b9c950ywan      }
844233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
845233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
846233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
847233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Dispatches to one of the two mb_lpf_horizontal_edge_w_sse2_* helpers based
// on count: a count of exactly 1 selects the _8 helper, every other value
// selects the _16 helper. s, p, _blimit, _limit and _thresh are forwarded to
// the chosen helper unchanged; count itself is not passed on.
// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
  if (count != 1) {
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
    return;
  }

  mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
}
858233d2500723e5594f3e7c70896ffeeef32b9c950ywan
859233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
860233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               const unsigned char *_blimit,
861233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               const unsigned char *_limit,
862233d2500723e5594f3e7c70896ffeeef32b9c950ywan                               const unsigned char *_thresh, int count) {
863233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
864233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
865233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
866233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
867233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
868233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
869233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i zero = _mm_set1_epi16(0);
870233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
871233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
872233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
873233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i mask, hev, flat;
874233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
875233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
876233d2500723e5594f3e7c70896ffeeef32b9c950ywan
877233d2500723e5594f3e7c70896ffeeef32b9c950ywan  (void)count;
878233d2500723e5594f3e7c70896ffeeef32b9c950ywan
879233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
880233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
881233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
882233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
883233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
884233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
885233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
886233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
887233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p1q1 = _mm_shuffle_epi32(q1p1, 78);
888233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p0q0 = _mm_shuffle_epi32(q0p0, 78);
889233d2500723e5594f3e7c70896ffeeef32b9c950ywan
890233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
891233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filter_mask and hev_mask
892233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i one = _mm_set1_epi8(1);
893233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i fe = _mm_set1_epi8(0xfe);
894233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
895233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
896233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
897233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_subs_epu8(q0p0, q1p1));
898233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
899233d2500723e5594f3e7c70896ffeeef32b9c950ywan
900233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
901233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_subs_epu8(p0q0, q0p0));
902233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
903233d2500723e5594f3e7c70896ffeeef32b9c950ywan                            _mm_subs_epu8(p1q1, q1p1));
904233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
905233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_subs_epu8(flat, thresh);
906233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
907233d2500723e5594f3e7c70896ffeeef32b9c950ywan
908233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
909233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
910233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
911233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
912233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
913233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(abs_p1p0, mask);
914233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p1 - p0) > limit) * -1;
915233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(q1 - q0) > limit) * -1;
916233d2500723e5594f3e7c70896ffeeef32b9c950ywan
917233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
918233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q1p1, q2p2)),
919233d2500723e5594f3e7c70896ffeeef32b9c950ywan                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
920233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q2p2, q3p3)));
921233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
922233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
923233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(mask, limit);
924233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_cmpeq_epi8(mask, zero);
925233d2500723e5594f3e7c70896ffeeef32b9c950ywan
926233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // flat_mask4
927233d2500723e5594f3e7c70896ffeeef32b9c950ywan
928233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
929233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q0p0, q2p2)),
930233d2500723e5594f3e7c70896ffeeef32b9c950ywan                        _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
931233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q0p0, q3p3)));
932233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(abs_p1p0, flat);
933233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
934233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_subs_epu8(flat, one);
935233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_cmpeq_epi8(flat, zero);
936233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_and_si128(flat, mask);
937233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
938233d2500723e5594f3e7c70896ffeeef32b9c950ywan
939233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
940233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i four = _mm_set1_epi16(4);
941233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned char *src = s;
942233d2500723e5594f3e7c70896ffeeef32b9c950ywan    {
943233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __m128i workp_a, workp_b, workp_shft;
944233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
945233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
946233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
947233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
948233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
949233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
950233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
951233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
952233d2500723e5594f3e7c70896ffeeef32b9c950ywan
953233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
954233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
955233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
956233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
957233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op2[0],
958233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
959233d2500723e5594f3e7c70896ffeeef32b9c950ywan
960233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
961233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
962233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op1[0],
963233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
964233d2500723e5594f3e7c70896ffeeef32b9c950ywan
965233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
966233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
967233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
968233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op0[0],
969233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
970233d2500723e5594f3e7c70896ffeeef32b9c950ywan
971233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
972233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
973233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
974233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq0[0],
975233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
976233d2500723e5594f3e7c70896ffeeef32b9c950ywan
977233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
978233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
979233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
980233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq1[0],
981233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
982233d2500723e5594f3e7c70896ffeeef32b9c950ywan
983233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
984233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
985233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
986233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq2[0],
987233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
988233d2500723e5594f3e7c70896ffeeef32b9c950ywan    }
989233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
990233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // lp filter
991233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
992233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t4 = _mm_set1_epi8(4);
993233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t3 = _mm_set1_epi8(3);
994233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t80 = _mm_set1_epi8(0x80);
995233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1 = _mm_set1_epi8(0x1);
996233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
997233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
998233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
999233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1000233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
1001233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1002233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
1003233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1004233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filt;
1005233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work_a;
1006233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filter1, filter2;
1007233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1008233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1009233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_subs_epi8(qs0, ps0);
1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1011233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1012233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1013233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // (vp9_filter + 3 * (qs0 - ps0)) & mask
1014233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, mask);
1015233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1016233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_adds_epi8(filt, t4);
1017233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_adds_epi8(filt, t3);
1018233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1019233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter1 >> 3
1020233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_unpacklo_epi8(zero, filter1);
1021233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_srai_epi16(filter1, 11);
1022233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_packs_epi16(filter1, filter1);
1023233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1024233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter2 >> 3
1025233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_unpacklo_epi8(zero, filter2);
1026233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_srai_epi16(filter2, 11);
1027233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_packs_epi16(filter2, zero);
1028233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1029233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filt >> 1
1030233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filter1, t1);
1031233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_unpacklo_epi8(zero, filt);
1032233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_srai_epi16(filt, 9);
1033233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_packs_epi16(filt, zero);
1034233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1035233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_andnot_si128(hev, filt);
1036233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1037233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1038233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
1039233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1040233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_and_si128(flat, q0);
1041233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_or_si128(work_a, q0);
1042233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1043233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1044233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
1045233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1046233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_and_si128(flat, q1);
1047233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_or_si128(work_a, q1);
1048233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1050233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
1051233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1052233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_and_si128(flat, q2);
1053233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_or_si128(work_a, q2);
1054233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1055233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1056233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
1057233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1058233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_and_si128(flat, p0);
1059233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_or_si128(work_a, p0);
1060233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1061233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1062233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
1063233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1064233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_and_si128(flat, p1);
1065233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_or_si128(work_a, p1);
1066233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1067233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1068233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
1069233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1070233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_and_si128(flat, p2);
1071233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_or_si128(work_a, p2);
1072233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1073233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
1074233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
1075233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
1076233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
1077233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
1078233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
1079233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1080233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1081233d2500723e5594f3e7c70896ffeeef32b9c950ywan
// Filters 16 pixel columns across one horizontal edge in a single pass. The
// low 8 columns use the (_blimit0, _limit0, _thresh0) parameter set and the
// high 8 columns use (_blimit1, _limit1, _thresh1).
//
//   s        Pointer to the first of 16 pixels in row q0. Rows s - 4 * p
//            through s + 3 * p are read (16 bytes each); rows s - 3 * p
//            through s + 2 * p are written back (16 bytes each).
//   p        Distance in bytes between consecutive rows.
//   _blimit0/1, _limit0/1, _thresh0/1
//            Threshold vectors. They are read with _mm_load_si128, so each
//            pointer must be 16-byte aligned and readable for 16 bytes; only
//            the low 8 bytes of each are used.
//            NOTE(review): presumably every byte holds the same replicated
//            threshold value -- confirm against the callers.
//
// Per column one of three results is written:
//   - mask clear: all filter terms evaluate to zero and the original pixel
//     values are stored back unchanged;
//   - mask set, flat clear: p1, p0, q0 and q1 receive the narrow signed
//     adjustment computed in the "lp filter" section; p2 and q2 are unchanged;
//   - flat set: p2..q2 are replaced by the rounded 8-sample weighted averages
//     computed in the middle section.
1082233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
1083233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_blimit0,
1084233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_limit0,
1085233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_thresh0,
1086233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_blimit1,
1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_limit1,
1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const uint8_t *_thresh1) {
  // Scratch rows holding the wide-filter output for p2..q2: bytes 0-7 are
  // filled on loop pass 0, bytes 8-15 on pass 1. They are later read with
  // aligned 16-byte loads. NOTE(review): DECLARE_ALIGNED_ARRAY is a project
  // macro not visible here; presumably (alignment, type, name, count).
1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i zero = _mm_set1_epi16(0);
  // Combine the two parameter sets: the low 64 bits come from set 0 and the
  // high 64 bits from set 1, matching the column halves they apply to.
1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i blimit =
1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_blimit1));
1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i limit =
1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_limit1));
1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i thresh =
1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_thresh1));
1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i mask, hev, flat;
1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan
  // Load all eight rows (16 pixels each) as unsigned bytes; unaligned loads,
  // so s and p carry no alignment requirement.
1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
    // Unsigned |a - b| per byte: one of the two saturating subtractions is
    // zero, so OR-ing them yields the absolute difference.
1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(p0, p1));
1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(q0, q1));
1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i one = _mm_set1_epi8(1);
1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i fe = _mm_set1_epi8(0xfe);
    // All-ones vector (any value compared with itself is equal).
1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q0, p0));
1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q1, p1));
1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work;
1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filter_mask and hev_mask
    // flat temporarily holds max(|p1 - p0|, |q1 - q0|); it is reused below
    // for both the limit test and the flatness test.
1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh, else 0.
1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_subs_epu8(flat, thresh);
1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // |p0 - q0| * 2 (saturating) and |p1 - q1| / 2. There is no per-byte
    // shift, so the low bit of each byte is cleared first to stop the 16-bit
    // shift from leaking it into the neighbouring byte.
1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask is now 0xff where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit.
    // Fold in max(abs(p1 - p0), abs(q1 - q0)) so it is limit-tested below.
1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(flat, mask);
1142233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Fold in abs(p2 - p1) and abs(p3 - p2), then abs(q2 - q1) and
1143233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // abs(q3 - q2), keeping the running per-byte maximum in mask.
1144233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1145233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(p1, p2)),
1146233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(p3, p2),
1147233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(p2, p3)));
1148233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
1149233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1150233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q1, q2)),
1151233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(q3, q2),
1152233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(q2, q3)));
1153233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
    // Final mask: 0xff where the running maximum is <= limit, else 0. A byte
    // forced to 0xff by the blimit test therefore fails unless limit is 0xff.
1154233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(mask, limit);
1155233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_cmpeq_epi8(mask, zero);
1156233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1157233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // flat_mask4
    // Extend the running maximum in flat with |p2 - p0|, |q2 - q0|,
    // |p3 - p0| and |q3 - q0|.
1158233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
1159233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(p0, p2)),
1160233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(q2, q0),
1161233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(q0, q2)));
1162233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(work, flat);
1163233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
1164233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(p0, p3)),
1165233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(q3, q0),
1166233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(q0, q3)));
1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(work, flat);
    // flat = 0xff where every difference above is <= 1, restricted to the
    // columns that also passed the filter mask.
1168233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_subs_epu8(flat, one);
1169233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_cmpeq_epi8(flat, zero);
1170233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_and_si128(flat, mask);
1171233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
  // Wide filter: computed unconditionally for all 16 columns into the
  // flat_o* scratch rows; the flat mask selects where it is used later.
1172233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
    // Rounding term for the >> 3 below.
1173233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i four = _mm_set1_epi16(4);
1174233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned char *src = s;
1175233d2500723e5594f3e7c70896ffeeef32b9c950ywan    int i = 0;
1176233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // Two passes of 8 columns each: the sums need 16-bit lanes, so only
    // 8 pixels fit in one register.
1177233d2500723e5594f3e7c70896ffeeef32b9c950ywan    do {
1178233d2500723e5594f3e7c70896ffeeef32b9c950ywan      __m128i workp_a, workp_b, workp_shft;
      // Reload 8 pixels per row and zero-extend them to 16 bits.
1179233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
1180233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
1181233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
1182233d2500723e5594f3e7c70896ffeeef32b9c950ywan      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
1183233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
1184233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
1186233d2500723e5594f3e7c70896ffeeef32b9c950ywan      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
1187233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // workp_a and workp_b are running partial sums; each later output is
      // derived by subtracting the samples that leave the window and adding
      // the ones that enter it.
      // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
1188233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
1189233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
1190233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
1191233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1192233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
1193233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1194233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
1195233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
1196233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1197233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
1198233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1199233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
1200233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
1201233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
1202233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1203233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
1204233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1205233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
1206233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
1207233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
1208233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1209233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
1210233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1211233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // oq1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
1212233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
1213233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
1214233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1215233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
1216233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1217233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // oq2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3
1218233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
1219233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
1220233d2500723e5594f3e7c70896ffeeef32b9c950ywan      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1221233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
1222233d2500723e5594f3e7c70896ffeeef32b9c950ywan                       _mm_packus_epi16(workp_shft, workp_shft));
1223233d2500723e5594f3e7c70896ffeeef32b9c950ywan
      // Advance to the next 8 columns.
1224233d2500723e5594f3e7c70896ffeeef32b9c950ywan      src += 8;
1225233d2500723e5594f3e7c70896ffeeef32b9c950ywan    } while (++i < 2);
1226233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1227233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // lp filter
1228233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
1229233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t4 = _mm_set1_epi8(4);
1230233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t3 = _mm_set1_epi8(3);
1231233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t80 = _mm_set1_epi8(0x80);
1232233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i te0 = _mm_set1_epi8(0xe0);
1233233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1f = _mm_set1_epi8(0x1f);
1234233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1 = _mm_set1_epi8(0x1);
1235233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t7f = _mm_set1_epi8(0x7f);
1236233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // XOR with 0x80 maps unsigned pixels 0..255 onto signed bytes -128..127
    // so that the signed saturating intrinsics can be used below.
1237233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1238233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1239233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1240233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1241233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1242233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1243233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1244233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1245233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filt;
1246233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work_a;
1247233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filter1, filter2;
1248233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // filt = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), saturating at every step;
    // the multiply by 3 is done as three saturating additions.
1249233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1250233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_subs_epi8(qs0, ps0);
1251233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1252233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1253233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1254233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // (vp9_filter + 3 * (qs0 - ps0)) & mask
1255233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, mask);
1256233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1257233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_adds_epi8(filt, t4);
1258233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_adds_epi8(filt, t3);
1259233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1260233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter1 >> 3
    // Per-byte arithmetic shift emulated with a 16-bit logical shift: keep
    // the low 5 bits of each byte, then set the top 3 bits where the input
    // byte was negative (sign extension).
1261233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter1);
1262233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_srli_epi16(filter1, 3);
1263233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
1264233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_and_si128(filter1, t1f);
1265233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_or_si128(filter1, work_a);
1266233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1267233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter2 >> 3
1268233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter2);
1269233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_srli_epi16(filter2, 3);
1270233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
1271233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_and_si128(filter2, t1f);
1272233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_or_si128(filter2, work_a);
1273233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1274233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filt >> 1
    // filt = (filter1 + 1) >> 1, using the same per-byte arithmetic-shift
    // emulation (keep low 7 bits, restore the sign bit).
1275233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filter1, t1);
1276233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filt);
1277233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_srli_epi16(filt, 1);
1278233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, t80);
1279233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, t7f);
1280233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_or_si128(filt, work_a);
1281233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // The outer-tap adjustment is applied only where hev is clear.
1282233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_andnot_si128(hev, filt);
1283233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // Per-byte select for each output row: take the wide-filter value from
    // the scratch row where flat is set, otherwise the narrow-filter value
    // (converted back to unsigned by XOR with 0x80).
    // q0: narrow value is qs0 - filter1.
1284233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1285233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_load_si128((__m128i *)flat_oq0);
1286233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1287233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_and_si128(flat, q0);
1288233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_or_si128(work_a, q0);
1289233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // q1: narrow value is qs1 - filt.
1290233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1291233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_load_si128((__m128i *)flat_oq1);
1292233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1293233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_and_si128(flat, q1);
1294233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_or_si128(work_a, q1);
1295233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // q2: the narrow filter does not touch q2, so the fallback is the
    // original row reloaded from memory.
1296233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1297233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_load_si128((__m128i *)flat_oq2);
1298233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1299233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_and_si128(flat, q2);
1300233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q2 = _mm_or_si128(work_a, q2);
1301233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // p0: narrow value is ps0 + filter2.
1302233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1303233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_load_si128((__m128i *)flat_op0);
1304233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1305233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_and_si128(flat, p0);
1306233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_or_si128(work_a, p0);
1307233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // p1: narrow value is ps1 + filt.
1308233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1309233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_load_si128((__m128i *)flat_op1);
1310233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1311233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_and_si128(flat, p1);
1312233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_or_si128(work_a, p1);
1313233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // p2: as with q2, the fallback is the original row.
1314233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1315233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_load_si128((__m128i *)flat_op2);
1316233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_andnot_si128(flat, work_a);
1317233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_and_si128(flat, p2);
1318233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p2 = _mm_or_si128(work_a, p2);
1319233d2500723e5594f3e7c70896ffeeef32b9c950ywan
    // Write the six result rows back in place; all inputs were read before
    // this point, so the stores cannot disturb the computation.
1320233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
1321233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1322233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1323233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1324233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1325233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
1326233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1327233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1328233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1329233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
1330233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_blimit0,
1331233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_limit0,
1332233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_thresh0,
1333233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_blimit1,
1334233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_limit1,
1335233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    const unsigned char *_thresh1) {
1336233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i blimit =
1337233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
1338233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_blimit1));
1339233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i limit =
1340233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
1341233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_limit1));
1342233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i thresh =
1343233d2500723e5594f3e7c70896ffeeef32b9c950ywan      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
1344233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_load_si128((const __m128i *)_thresh1));
1345233d2500723e5594f3e7c70896ffeeef32b9c950ywan  const __m128i zero = _mm_set1_epi16(0);
1346233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
1347233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i mask, hev, flat;
1348233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1349233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
1350233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
1351233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
1352233d2500723e5594f3e7c70896ffeeef32b9c950ywan  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
1353233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
1354233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
1355233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
1356233d2500723e5594f3e7c70896ffeeef32b9c950ywan  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
1357233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1358233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // filter_mask and hev_mask
1359233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
1360233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
1361233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(p0, p1));
1362233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
1363233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                          _mm_subs_epu8(q0, q1));
1364233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i fe = _mm_set1_epi8(0xfe);
1365233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
1366233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
1367233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q0, p0));
1368233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
1369233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                    _mm_subs_epu8(q1, p1));
1370233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work;
1371233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1372233d2500723e5594f3e7c70896ffeeef32b9c950ywan    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
1373233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_subs_epu8(flat, thresh);
1374233d2500723e5594f3e7c70896ffeeef32b9c950ywan    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
1375233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1376233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
1377233d2500723e5594f3e7c70896ffeeef32b9c950ywan    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
1378233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
1379233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
1380233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
1381233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(flat, mask);
1382233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(p1 - p0) > limit) * -1;
1383233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // mask |= (abs(q1 - q0) > limit) * -1;
1384233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
1385233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(p1, p2)),
1386233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(p3, p2),
1387233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(p2, p3)));
1388233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
1389233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
1390233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                     _mm_subs_epu8(q1, q2)),
1391233d2500723e5594f3e7c70896ffeeef32b9c950ywan                         _mm_or_si128(_mm_subs_epu8(q3, q2),
1392233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      _mm_subs_epu8(q2, q3)));
1393233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_max_epu8(work, mask);
1394233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_subs_epu8(mask, limit);
1395233d2500723e5594f3e7c70896ffeeef32b9c950ywan    mask = _mm_cmpeq_epi8(mask, zero);
1396233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1397233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1398233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // filter4
1399233d2500723e5594f3e7c70896ffeeef32b9c950ywan  {
1400233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t4 = _mm_set1_epi8(4);
1401233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t3 = _mm_set1_epi8(3);
1402233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t80 = _mm_set1_epi8(0x80);
1403233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i te0 = _mm_set1_epi8(0xe0);
1404233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1f = _mm_set1_epi8(0x1f);
1405233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t1 = _mm_set1_epi8(0x1);
1406233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i t7f = _mm_set1_epi8(0x7f);
1407233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1408233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
1409233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1410233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
1411233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1412233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
1413233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1414233d2500723e5594f3e7c70896ffeeef32b9c950ywan    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
1415233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                      t80);
1416233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filt;
1417233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i work_a;
1418233d2500723e5594f3e7c70896ffeeef32b9c950ywan    __m128i filter1, filter2;
1419233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1420233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
1421233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_subs_epi8(qs0, ps0);
1422233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1423233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1424233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filt, work_a);
1425233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // (vp9_filter + 3 * (qs0 - ps0)) & mask
1426233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, mask);
1427233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1428233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_adds_epi8(filt, t4);
1429233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_adds_epi8(filt, t3);
1430233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1431233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter1 >> 3
1432233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter1);
1433233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_srli_epi16(filter1, 3);
1434233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
1435233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_and_si128(filter1, t1f);
1436233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter1 = _mm_or_si128(filter1, work_a);
1437233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1438233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // Filter2 >> 3
1439233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filter2);
1440233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_srli_epi16(filter2, 3);
1441233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, te0);
1442233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_and_si128(filter2, t1f);
1443233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filter2 = _mm_or_si128(filter2, work_a);
1444233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1445233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // filt >> 1
1446233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_adds_epi8(filter1, t1);
1447233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_cmpgt_epi8(zero, filt);
1448233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_srli_epi16(filt, 1);
1449233d2500723e5594f3e7c70896ffeeef32b9c950ywan    work_a = _mm_and_si128(work_a, t80);
1450233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_and_si128(filt, t7f);
1451233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_or_si128(filt, work_a);
1452233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1453233d2500723e5594f3e7c70896ffeeef32b9c950ywan    filt = _mm_andnot_si128(hev, filt);
1454233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1455233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1456233d2500723e5594f3e7c70896ffeeef32b9c950ywan    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1457233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1458233d2500723e5594f3e7c70896ffeeef32b9c950ywan    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1459233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1460233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1461233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1462233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1463233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1464233d2500723e5594f3e7c70896ffeeef32b9c950ywan  }
1465233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1466233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1467233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
1468233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                 int in_p, unsigned char *out, int out_p) {
1469233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1470233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
1471233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1472233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Read in 16 lines
1473233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = _mm_loadl_epi64((__m128i *)in0);
1474233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x8 = _mm_loadl_epi64((__m128i *)in1);
1475233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
1476233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
1477233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
1478233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
1479233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
1480233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
1481233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
1482233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
1483233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
1484233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
1485233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
1486233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
1487233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
1488233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
1489233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1490233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x0 = _mm_unpacklo_epi8(x0, x1);
1491233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x1 = _mm_unpacklo_epi8(x2, x3);
1492233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x2 = _mm_unpacklo_epi8(x4, x5);
1493233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x3 = _mm_unpacklo_epi8(x6, x7);
1494233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1495233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x8 = _mm_unpacklo_epi8(x8, x9);
1496233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x9 = _mm_unpacklo_epi8(x10, x11);
1497233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x10 = _mm_unpacklo_epi8(x12, x13);
1498233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x11 = _mm_unpacklo_epi8(x14, x15);
1499233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1500233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = _mm_unpacklo_epi16(x0, x1);
1501233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = _mm_unpacklo_epi16(x2, x3);
1502233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = _mm_unpacklo_epi16(x8, x9);
1503233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = _mm_unpacklo_epi16(x10, x11);
1504233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1505233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = _mm_unpacklo_epi32(x4, x5);
1506233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = _mm_unpackhi_epi32(x4, x5);
1507233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = _mm_unpacklo_epi32(x12, x13);
1508233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = _mm_unpackhi_epi32(x12, x13);
1509233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1510233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Store first 4-line result
1511233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
1512233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
1513233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
1514233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
1515233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1516233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x4 = _mm_unpackhi_epi16(x0, x1);
1517233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x5 = _mm_unpackhi_epi16(x2, x3);
1518233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x12 = _mm_unpackhi_epi16(x8, x9);
1519233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x13 = _mm_unpackhi_epi16(x10, x11);
1520233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1521233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x6 = _mm_unpacklo_epi32(x4, x5);
1522233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x7 = _mm_unpackhi_epi32(x4, x5);
1523233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x14 = _mm_unpacklo_epi32(x12, x13);
1524233d2500723e5594f3e7c70896ffeeef32b9c950ywan  x15 = _mm_unpackhi_epi32(x12, x13);
1525233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1526233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Store second 4-line result
1527233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
1528233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
1529233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
1530233d2500723e5594f3e7c70896ffeeef32b9c950ywan  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
1531233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1532233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1533233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic INLINE void transpose(unsigned char *src[], int in_p,
1534233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             unsigned char *dst[], int out_p,
1535233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             int num_8x8_to_transpose) {
1536233d2500723e5594f3e7c70896ffeeef32b9c950ywan  int idx8x8 = 0;
1537233d2500723e5594f3e7c70896ffeeef32b9c950ywan  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
1538233d2500723e5594f3e7c70896ffeeef32b9c950ywan  do {
1539233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned char *in = src[idx8x8];
1540233d2500723e5594f3e7c70896ffeeef32b9c950ywan    unsigned char *out = dst[idx8x8];
1541233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1542233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
1543233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
1544233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
1545233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
1546233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
1547233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
1548233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
1549233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
1550233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
1551233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x0 = _mm_unpacklo_epi8(x0, x1);
1552233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
1553233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x1 = _mm_unpacklo_epi8(x2, x3);
1554233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
1555233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x2 = _mm_unpacklo_epi8(x4, x5);
1556233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
1557233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x3 = _mm_unpacklo_epi8(x6, x7);
1558233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
1559233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x4 = _mm_unpacklo_epi16(x0, x1);
1560233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
1561233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x5 = _mm_unpacklo_epi16(x2, x3);
1562233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
1563233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x6 = _mm_unpacklo_epi32(x4, x5);
1564233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
1565233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x7 = _mm_unpackhi_epi32(x4, x5);
1566233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1567233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_pd((double *)(out + 0*out_p),
1568233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
1569233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeh_pd((double *)(out + 1*out_p),
1570233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
1571233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_pd((double *)(out + 2*out_p),
1572233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
1573233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeh_pd((double *)(out + 3*out_p),
1574233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
1575233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1576233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
1577233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x4 = _mm_unpackhi_epi16(x0, x1);
1578233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
1579233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x5 = _mm_unpackhi_epi16(x2, x3);
1580233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
1581233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x6 = _mm_unpacklo_epi32(x4, x5);
1582233d2500723e5594f3e7c70896ffeeef32b9c950ywan    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
1583233d2500723e5594f3e7c70896ffeeef32b9c950ywan    x7 = _mm_unpackhi_epi32(x4, x5);
1584233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1585233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_pd((double *)(out + 4*out_p),
1586233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
1587233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeh_pd((double *)(out + 5*out_p),
1588233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
1589233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storel_pd((double *)(out + 6*out_p),
1590233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
1591233d2500723e5594f3e7c70896ffeeef32b9c950ywan    _mm_storeh_pd((double *)(out + 7*out_p),
1592233d2500723e5594f3e7c70896ffeeef32b9c950ywan                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
1593233d2500723e5594f3e7c70896ffeeef32b9c950ywan  } while (++idx8x8 < num_8x8_to_transpose);
1594233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1595233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1596233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1597233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *limit0,
1598233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *thresh0,
1599233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *blimit1,
1600233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *limit1,
1601233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *thresh1) {
1602233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
1603233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *src[2];
1604233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *dst[2];
1605233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1606233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose 8x16
1607233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1608233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1609233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Loop filtering
1610233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1611233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                 blimit1, limit1, thresh1);
1612233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = t_dst;
1613233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[1] = t_dst + 8;
1614233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = s - 4;
1615233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[1] = s - 4 + p * 8;
1616233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1617233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose back
1618233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, 16, dst, p, 2);
1619233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1620233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1621233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
1622233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             const unsigned char *blimit,
1623233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             const unsigned char *limit,
1624233d2500723e5594f3e7c70896ffeeef32b9c950ywan                             const unsigned char *thresh, int count) {
1625233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
1626233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *src[1];
1627233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *dst[1];
1628233d2500723e5594f3e7c70896ffeeef32b9c950ywan  (void)count;
1629233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1630233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose 8x8
1631233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = s - 4;
1632233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = t_dst;
1633233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1634233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, p, dst, 8, 1);
1635233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1636233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Loop filtering
1637233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
1638233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1639233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = t_dst;
1640233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = s - 4;
1641233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1642233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose back
1643233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, 8, dst, p, 1);
1644233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1645233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1646233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1647233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *limit0,
1648233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *thresh0,
1649233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *blimit1,
1650233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *limit1,
1651233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                  const uint8_t *thresh1) {
1652233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
1653233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *src[2];
1654233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *dst[2];
1655233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1656233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose 8x16
1657233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1658233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1659233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Loop filtering
1660233d2500723e5594f3e7c70896ffeeef32b9c950ywan  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1661233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                 blimit1, limit1, thresh1);
1662233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = t_dst;
1663233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[1] = t_dst + 8;
1664233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1665233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = s - 4;
1666233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[1] = s - 4 + p * 8;
1667233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1668233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose back
1669233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, 16, dst, p, 2);
1670233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1671233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1672233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
1673233d2500723e5594f3e7c70896ffeeef32b9c950ywan                              const unsigned char *blimit,
1674233d2500723e5594f3e7c70896ffeeef32b9c950ywan                              const unsigned char *limit,
1675233d2500723e5594f3e7c70896ffeeef32b9c950ywan                              const unsigned char *thresh) {
1676233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
1677233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *src[2];
1678233d2500723e5594f3e7c70896ffeeef32b9c950ywan  unsigned char *dst[2];
1679233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1680233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = s - 8;
1681233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[1] = s;
1682233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = t_dst;
1683233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[1] = t_dst + 8 * 8;
1684233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1685233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose 16x8
1686233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, p, dst, 8, 2);
1687233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1688233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Loop filtering
1689233d2500723e5594f3e7c70896ffeeef32b9c950ywan  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
1690233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1691233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[0] = t_dst;
1692233d2500723e5594f3e7c70896ffeeef32b9c950ywan  src[1] = t_dst + 8 * 8;
1693233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[0] = s - 8;
1694233d2500723e5594f3e7c70896ffeeef32b9c950ywan  dst[1] = s;
1695233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1696233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose back
1697233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose(src, 8, dst, p, 2);
1698233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1699233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1700233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1701233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                   const uint8_t *blimit, const uint8_t *limit,
1702233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                   const uint8_t *thresh) {
1703233d2500723e5594f3e7c70896ffeeef32b9c950ywan  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1704233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1705233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose 16x16
1706233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1707233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1708233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1709233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Loop filtering
1710233d2500723e5594f3e7c70896ffeeef32b9c950ywan  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
1711233d2500723e5594f3e7c70896ffeeef32b9c950ywan                                   thresh);
1712233d2500723e5594f3e7c70896ffeeef32b9c950ywan
1713233d2500723e5594f3e7c70896ffeeef32b9c950ywan  // Transpose back
1714233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1715233d2500723e5594f3e7c70896ffeeef32b9c950ywan  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1716233d2500723e5594f3e7c70896ffeeef32b9c950ywan}
1717