/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org#include <emmintrin.h>  // SSE2
126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org#include "vp9/common/vp9_loopfilter.h"
13d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org#include "vpx_ports/emmintrin_compat.h"
146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Per-byte absolute difference |a - b| of two vectors, each treated as
// sixteen unsigned 8-bit lanes.
//
// Unsigned saturating subtraction clamps a would-be negative result to 0, so
// in every lane at most one of (a - b) and (b - a) is non-zero. OR-ing the two
// therefore yields the magnitude of the difference without widening the lanes
// or using signed arithmetic.
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
1887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
1947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
2047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                            int p,
2147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                            const unsigned char *_blimit,
2247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                            const unsigned char *_limit,
2347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                            const unsigned char *_thresh) {
24d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  const __m128i zero = _mm_set1_epi16(0);
2506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org  const __m128i one = _mm_set1_epi8(1);
26d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
27d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
28d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
29d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i mask, hev, flat, flat2;
30f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
31f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  __m128i abs_p1p0;
32d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
33f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
34f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
35f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                       (__m64 *)(s + 4 * p)));
36f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
37f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3),
38f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                       (__m64 *)(s + 3 * p)));
39f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
40f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2),
41f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                       (__m64 *)(s + 2 * p)));
42f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
43f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1),
44f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                       (__m64 *)(s + 1 * p)));
45f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  p1q1 = _mm_shuffle_epi32(q1p1, 78);
46f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
47f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0),
48f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                       (__m64 *)(s - 0 * p)));
49f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org  p0q0 = _mm_shuffle_epi32(q0p0, 78);
5006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org
51d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  {
52f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
5387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p1p0 = abs_diff(q1p1, q0p0);
54f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
55f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    fe = _mm_set1_epi8(0xfe);
56f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
5787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p0q0 = abs_diff(q0p0, p0q0);
5887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p1q1 = abs_diff(q1p1, p1q1);
59d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
60d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    hev = _mm_subs_epu8(flat, thresh);
61d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
62d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
63d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
64d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
65d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
66d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
67d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
68f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    mask = _mm_max_epu8(abs_p1p0, mask);
69d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    // mask |= (abs(p1 - p0) > limit) * -1;
70d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    // mask |= (abs(q1 - q0) > limit) * -1;
71f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
7287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
7387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                        abs_diff(q3p3, q2p2));
74d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    mask = _mm_max_epu8(work, mask);
75f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
76d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    mask = _mm_subs_epu8(mask, limit);
77d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    mask = _mm_cmpeq_epi8(mask, zero);
78d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  }
79d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
80d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  // lp filter
81d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  {
82d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    const __m128i t4 = _mm_set1_epi8(4);
83d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    const __m128i t3 = _mm_set1_epi8(3);
84d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    const __m128i t80 = _mm_set1_epi8(0x80);
85f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    const __m128i t1 = _mm_set1_epi16(0x1);
86f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
87f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
88f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i qs0 = _mm_xor_si128(p0q0, t80);
89f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i qs1 = _mm_xor_si128(p1q1, t80);
90d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    __m128i filt;
91d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    __m128i work_a;
92d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    __m128i filter1, filter2;
93f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
94f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
95d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
96f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
97f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    work_a = _mm_subs_epi8(qs0, qs0ps0);
98d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
99d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
100d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // (vp9_filter + 3 * (qs0 - ps0)) & mask
102d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filt = _mm_and_si128(filt, mask);
103d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
104d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filter1 = _mm_adds_epi8(filt, t4);
105d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org    filter2 = _mm_adds_epi8(filt, t3);
106d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
107f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filter1 = _mm_unpacklo_epi8(zero, filter1);
108f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filter1 = _mm_srai_epi16(filter1, 0xB);
109f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filter2 = _mm_unpacklo_epi8(zero, filter2);
110f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filter2 = _mm_srai_epi16(filter2, 0xB);
111f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
112d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter1 >> 3
113f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
114f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
115d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
116d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filt >> 1
117f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_adds_epi16(filter1, t1);
118f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_srai_epi16(filt, 1);
119f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
120f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                            filt);
121f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
122f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
12306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    // loopfilter done
12406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org
12506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    {
12606d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      __m128i work;
12787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
128f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat = _mm_max_epu8(abs_p1p0, flat);
129f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
13006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat = _mm_subs_epu8(flat, one);
13106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat = _mm_cmpeq_epi8(flat, zero);
13206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat = _mm_and_si128(flat, mask);
13306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org
134f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
135f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5),
136f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           (__m64 *)(s + 5 * p)));
137f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
138f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
139f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6),
140f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           (__m64 *)(s + 6 * p)));
14187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
142f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
143f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
144f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7),
145f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           (__m64 *)(s + 7 * p)));
14687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
14706d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat2 = _mm_max_epu8(work, flat2);
148f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
14906d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat2 = _mm_subs_epu8(flat2, one);
15006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat2 = _mm_cmpeq_epi8(flat2, zero);
15106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
15206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    }
15306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org
15406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
15506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    // flat and wide flat calculations
15606d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    {
15706d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      const __m128i eight = _mm_set1_epi16(8);
15806d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      const __m128i four = _mm_set1_epi16(4);
159f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
160f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
161f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      __m128i pixelFilter_p, pixelFilter_q;
162f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
163f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
164f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
165f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p7_16 = _mm_unpacklo_epi8(q7p7, zero);;
166f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
167f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
168f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
169f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
170f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
171f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
172f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
173f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
174f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
175f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
176f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
177f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
178f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
179f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
180f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      q7_16 = _mm_unpackhi_epi8(q7p7, zero);
181f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
182f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
183f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                    _mm_add_epi16(p4_16, p3_16));
184f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
185f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                    _mm_add_epi16(q4_16, q3_16));
186f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
187f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
188f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p =  _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
189f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
190f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
191f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q =  _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
192f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p =  _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p,
193f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                                         pixelFilter_q));
194f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_p2p1p0 =   _mm_add_epi16(four,
195f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(pixetFilter_p2p1p0,
196f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                                         pixetFilter_q2q1q0));
197f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
198f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(p7_16, p0_16)), 4);
199f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
200f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(q7_16, q0_16)), 4);
201f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
202f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
203f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(p3_16, p0_16)), 3);
204f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
205f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(q3_16, q0_16)), 3);
206f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
207f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
208f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
209f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(p7_16, p7_16);
210f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(q7_16, q7_16);
211f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p3 = _mm_add_epi16(p3_16, p3_16);
212f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q3 = _mm_add_epi16(q3_16, q3_16);
213f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
214f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
215f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
216f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
217f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p1_16)), 4);
218f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
219f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q1_16)), 4);
220f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
221f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
222f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
223f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
224f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
225f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p3, p1_16)), 3);
226f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
227f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q3, q1_16)), 3);
228f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat_q1p1 = _mm_packus_epi16(res_p, res_q);
229f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
230f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
231f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
232f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
233f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q3 = _mm_add_epi16(sum_q3, q3_16);
234f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
235f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
236f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
237f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
238f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p2_16)), 4);
239f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
240f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q2_16)), 4);
241f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
242f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
243f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
244f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
245f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
246f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0,
247f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(sum_p3, p2_16)), 3);
248f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0,
249f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                                           _mm_add_epi16(sum_q3, q2_16)), 3);
250f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
251f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
252f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
253f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
254f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
255f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
256f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
257f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p3_16)), 4);
258f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
259f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q3_16)), 4);
260f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
261f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
262f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
263f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
264f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
265f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
266f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
267f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p4_16)), 4);
268f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
269f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q4_16)), 4);
270f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
271f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
272f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
273f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
274f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
275f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
276f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
277f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p5_16)), 4);
278f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
279f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q5_16)), 4);
280f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
281f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
282f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
283f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
284f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
285f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
286f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p,
287f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_p7, p6_16)), 4);
288f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q,
289f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org                             _mm_add_epi16(sum_q7, q6_16)), 4);
290f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
29106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    }
29206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    // wide flat
29306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org
295f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat = _mm_shuffle_epi32(flat, 68);
296f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2 = _mm_shuffle_epi32(flat2, 68);
297f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
298f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q2p2 = _mm_andnot_si128(flat, q2p2);
299f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
300f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q2p2 = _mm_or_si128(q2p2, flat_q2p2);
301f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
302f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
303f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
304f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
305f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
306f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
307f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
308f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
309f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
310f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q6p6 = _mm_andnot_si128(flat2, q6p6);
311f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
312f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
313f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
314f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
315f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
316f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q5p5 = _mm_andnot_si128(flat2, q5p5);
317f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
318f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
319f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
320f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
321f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
322f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q4p4 = _mm_andnot_si128(flat2, q4p4);
323f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
324f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
325f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
326f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
327f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
328f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q3p3 = _mm_andnot_si128(flat2, q3p3);
329f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
330f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
331f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
332f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
333f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
334f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q2p2 = _mm_andnot_si128(flat2, q2p2);
335f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
336f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
337f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
338f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
339f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
340f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q1p1 = _mm_andnot_si128(flat2, q1p1);
341f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
342f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
343f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
344f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
345f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org
346f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q0p0 = _mm_andnot_si128(flat2, q0p0);
347f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
348f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
349f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
350f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org    _mm_storeh_pi((__m64 *)(s - 0 * p),  _mm_castsi128_ps(q0p0));
351d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  }
352d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org}
353d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
35487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.orgstatic INLINE __m128i filter_add2_sub2(const __m128i *const total,
35587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                       const __m128i *const a1,
35687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                       const __m128i *const a2,
35787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                       const __m128i *const s1,
35887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                       const __m128i *const s2) {
35987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  __m128i x = _mm_add_epi16(*a1, *total);
36087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
36187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  return x;
36287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org}
36387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
36487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.orgstatic INLINE __m128i filter8_mask(const __m128i *const flat,
36587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                   const __m128i *const other_filt,
36687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                   const __m128i *const f8_lo,
36787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                   const __m128i *const f8_hi) {
36887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  const __m128i f8 = _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3),
36987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                      _mm_srli_epi16(*f8_hi, 3));
37087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  const __m128i result = _mm_and_si128(*flat, f8);
37187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
37287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org}
37387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
37487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.orgstatic INLINE __m128i filter16_mask(const __m128i *const flat,
37587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                    const __m128i *const other_filt,
37687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                    const __m128i *const f_lo,
37787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                    const __m128i *const f_hi) {
37887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  const __m128i f = _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4),
37987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                                     _mm_srli_epi16(*f_hi, 4));
38087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  const __m128i result = _mm_and_si128(*flat, f);
38187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
38287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org}
38387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
38447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
38547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                             int p,
38647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                             const unsigned char *_blimit,
38747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                             const unsigned char *_limit,
38847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org                                             const unsigned char *_thresh) {
38947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i zero = _mm_set1_epi16(0);
39047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  const __m128i one = _mm_set1_epi8(1);
391d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
392d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
393d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
394d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i mask, hev, flat, flat2;
39547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i p7, p6, p5;
39647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
39747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  __m128i q5, q6, q7;
39847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
39987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  __m128i op2, op1, op0, oq0, oq1, oq2;
40087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
40187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  __m128i max_abs_p1p0q1q0;
40287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
40387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
40487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
40587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
40647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
40747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
40847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
40947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
41047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
41147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
41247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
41347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
41447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
41547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
41687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
41787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
41887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
41947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
42047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  {
42187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    const __m128i abs_p1p0 = abs_diff(p1, p0);
42287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    const __m128i abs_q1q0 = abs_diff(q1, q0);
42347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i fe = _mm_set1_epi8(0xfe);
42487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
42587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    __m128i abs_p0q0 = abs_diff(p0, q0);
42687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    __m128i abs_p1q1 = abs_diff(p1, q1);
42747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i work;
42887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
42947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
43047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
43147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
43247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
43347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
43447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
43587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
43647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // mask |= (abs(p1 - p0) > limit) * -1;
43747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // mask |= (abs(q1 - q0) > limit) * -1;
43887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
43947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_max_epu8(work, mask);
44087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
44147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_max_epu8(work, mask);
44247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_subs_epu8(mask, limit);
44347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    mask = _mm_cmpeq_epi8(mask, zero);
44447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
44547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
44687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  {
44787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    __m128i work;
44887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
44987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
45087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
45187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_max_epu8(work, flat);
45287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
45387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_subs_epu8(flat, one);
45487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_cmpeq_epi8(flat, zero);
45587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_and_si128(flat, mask);
45687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
45787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_max_epu8(work, flat2);
45887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
45987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_max_epu8(work, flat2);
46087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
46187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_max_epu8(work, flat2);
46287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_subs_epu8(flat2, one);
46387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_cmpeq_epi8(flat2, zero);
46487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
46587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  }
46687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
46787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org  // filter4
46947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  {
47047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t4 = _mm_set1_epi8(4);
47147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t3 = _mm_set1_epi8(3);
47247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t80 = _mm_set1_epi8(0x80);
47347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i te0 = _mm_set1_epi8(0xe0);
47447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t1f = _mm_set1_epi8(0x1f);
47547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t1 = _mm_set1_epi8(0x1);
47647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    const __m128i t7f = _mm_set1_epi8(0x7f);
47787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    const __m128i ff = _mm_cmpeq_epi8(t4, t4);
47847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
47947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i filt;
48047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i work_a;
48147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    __m128i filter1, filter2;
48247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
48387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    op1 = _mm_xor_si128(p1, t80);
48487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    op0 = _mm_xor_si128(p0, t80);
48587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    oq0 = _mm_xor_si128(q0, t80);
48687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    oq1 = _mm_xor_si128(q1, t80);
48787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
48887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
48987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
49087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
49187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
49287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work_a = _mm_subs_epi8(oq0, op0);
49347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
49447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
49547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
496d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // (vp9_filter + 3 * (qs0 - ps0)) & mask
49747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_and_si128(filt, mask);
49847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter1 = _mm_adds_epi8(filt, t4);
49947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter2 = _mm_adds_epi8(filt, t3);
50047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
501d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter1 >> 3
50247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_cmpgt_epi8(zero, filter1);
50347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter1 = _mm_srli_epi16(filter1, 3);
50447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_and_si128(work_a, te0);
50547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter1 = _mm_and_si128(filter1, t1f);
50647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter1 = _mm_or_si128(filter1, work_a);
50787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
50847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
509d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter2 >> 3
51047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_cmpgt_epi8(zero, filter2);
51147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter2 = _mm_srli_epi16(filter2, 3);
51247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_and_si128(work_a, te0);
51347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter2 = _mm_and_si128(filter2, t1f);
51447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filter2 = _mm_or_si128(filter2, work_a);
51587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
51647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
517d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filt >> 1
51847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_adds_epi8(filter1, t1);
51947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_cmpgt_epi8(zero, filt);
52047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_srli_epi16(filt, 1);
52147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    work_a = _mm_and_si128(work_a, t80);
52247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_and_si128(filt, t7f);
52347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_or_si128(filt, work_a);
52447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    filt = _mm_andnot_si128(hev, filt);
52587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
52687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
52747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // loopfilter done
52847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
52987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
53087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    // filter8
53147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    {
53287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i four = _mm_set1_epi16(4);
53387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
53487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
53587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
53687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
53787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
53887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
53987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
54087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
54187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
54287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
54387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
54487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
54587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
54687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
54787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
54887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
54987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
55087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      __m128i f8_lo, f8_hi;
55187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
55287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
55387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                            _mm_add_epi16(p3_lo, p2_lo));
55487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
55587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                            _mm_add_epi16(p2_lo, p1_lo));
55687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
55787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
55887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
55987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                            _mm_add_epi16(p3_hi, p2_hi));
56087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
56187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                            _mm_add_epi16(p2_hi, p1_hi));
56287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
56387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
56487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
56587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
56687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
56787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
56887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
56987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
57087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
57187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
57287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
57387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
57487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
57587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
57687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
57787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
57887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
57987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
58087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
58187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
58287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
58387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
58487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
58547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    }
58647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
58747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
58887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    // wide flat calculations
58947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    {
59047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org      const __m128i eight = _mm_set1_epi16(8);
59187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
59287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
59387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
59487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
59587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
59687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
59787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
59887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
59987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
60087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
60187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
60287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
60387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
60487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
60587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
60687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
60787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
60887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
60987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
61087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
61187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
61287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
61387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
61487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
61587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
61687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
61787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
61887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
61987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
62087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
62187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
62287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
62387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
62487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
62587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      __m128i f_lo;
62687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      __m128i f_hi;
62787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
62887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
62987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1),
63087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                           _mm_add_epi16(p4_lo, f_lo));
63187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
63287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                           _mm_add_epi16(p2_lo, p1_lo));
63387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
63487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
63587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
63687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
63787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1),
63887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                           _mm_add_epi16(p4_hi, f_hi));
63987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
64087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                           _mm_add_epi16(p2_hi, p1_hi));
64187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
64287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
64387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
64487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
64587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
64687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
64787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
64887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
64987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
65087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
65187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
65287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
65387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
65487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
65587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
65687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
65787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
65887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
65987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
66087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
66187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
66287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
66387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
66487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
66587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
66687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
66787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
66887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
66987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
67087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
67187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
67287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
67387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
67487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
67587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
67687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
67787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
67887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
67987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
68087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
68187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
68287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
68387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
68487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
68587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
68687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
68787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
68887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
68987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
69087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
69187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
69287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
69387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
69487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
69587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
69687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
69787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
69887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
69987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
70087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
70187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
70287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
70387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
70487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
70587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
70687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org
70787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
70887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
70987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
71087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
71147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    }
71247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // wide flat
71347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
71447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org  }
71547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org}
71647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
//
// Entry point that picks one of the two file-local wide-edge helpers from
// count: a count of exactly 1 runs mb_lpf_horizontal_edge_w_sse2_8, any other
// value runs mb_lpf_horizontal_edge_w_sse2_16.  s, p, _blimit, _limit and
// _thresh are handed to the chosen helper untouched (the 16 helper addresses
// rows as s +/- k * p); nothing is returned.
void vp9_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh, int count) {
  if (count != 1) {
    // Every count other than 1 takes the 16 path.
    mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh);
    return;
  }

  mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh);
}
72747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org
7288b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
7298b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                               const unsigned char *_blimit,
7308b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                               const unsigned char *_limit,
7318b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                               const unsigned char *_thresh, int count) {
732d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
733d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
734d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
735d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
736d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
737d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
7386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  const __m128i zero = _mm_set1_epi16(0);
739d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
740d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
741d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
742d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i mask, hev, flat;
74306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
744d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
7456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
746f7b25aef0ed571110c9f656f29ead07b02d33d89fgalligan@chromium.org  (void)count;
747d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
748d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
749d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
750d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
751d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
752d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
753d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
754d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
755d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
756d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p1q1 = _mm_shuffle_epi32(q1p1, 78);
757d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p0q0 = _mm_shuffle_epi32(q0p0, 78);
758d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
759d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  {
760d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filter_mask and hev_mask
761d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i one = _mm_set1_epi8(1);
762d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i fe = _mm_set1_epi8(0xfe);
763d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
764d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
76587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p1p0 = abs_diff(q1p1, q0p0);
766d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
767d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
76887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p0q0 = abs_diff(q0p0, p0q0);
76987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    abs_p1q1 = abs_diff(q1p1, p1q1);
770d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
771d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    hev = _mm_subs_epu8(flat, thresh);
772d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
773d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
774d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
775d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
776d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
777d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
778d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
779d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_max_epu8(abs_p1p0, mask);
780d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // mask |= (abs(p1 - p0) > limit) * -1;
781d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // mask |= (abs(q1 - q0) > limit) * -1;
782d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
78387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    work = _mm_max_epu8(abs_diff(q2p2, q1p1),
78487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                        abs_diff(q3p3, q2p2));
785d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_max_epu8(work, mask);
786d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
787d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_subs_epu8(mask, limit);
788d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    mask = _mm_cmpeq_epi8(mask, zero);
789d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
790d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // flat_mask4
791d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
79287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org    flat = _mm_max_epu8(abs_diff(q2p2, q0p0),
79387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org                        abs_diff(q3p3, q0p0));
794d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_max_epu8(abs_p1p0, flat);
795d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
796d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_subs_epu8(flat, one);
797d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_cmpeq_epi8(flat, zero);
798d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    flat = _mm_and_si128(flat, mask);
799d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
800d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
801d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  {
802d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i four = _mm_set1_epi16(4);
803d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    unsigned char *src = s;
804d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    {
805d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      __m128i workp_a, workp_b, workp_shft;
806d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
807d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
808d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
809d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
810d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
811d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
812d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
813d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
814d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
815d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
816d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
817d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
818d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
819d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op2[0],
820d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
821d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
822d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
823d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
824d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op1[0],
825d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
826d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
827d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
828d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
829d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
830d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op0[0],
831d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
832d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
833d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
834d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
835d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
836d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq0[0],
837d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
838d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
839d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
840d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
841d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
842d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq1[0],
843d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
844d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
845d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
846d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
847d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
848d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq2[0],
849d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
850d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    }
851d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
852d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // lp filter
853d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  {
854d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i t4 = _mm_set1_epi8(4);
855d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i t3 = _mm_set1_epi8(3);
856d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i t80 = _mm_set1_epi8(0x80);
857d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i t1 = _mm_set1_epi8(0x1);
858d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
859d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                                      t80);
860d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
861d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                                      t80);
862d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
863d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                                      t80);
864d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
865d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                                      t80);
866d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    __m128i filt;
867d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    __m128i work_a;
868d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    __m128i filter1, filter2;
869d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
870d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
871d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_subs_epi8(qs0, ps0);
872d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
873d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
874d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_adds_epi8(filt, work_a);
875d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // (vp9_filter + 3 * (qs0 - ps0)) & mask
876d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_and_si128(filt, mask);
877d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
878d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter1 = _mm_adds_epi8(filt, t4);
879d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter2 = _mm_adds_epi8(filt, t3);
880d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
881d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter1 >> 3
882d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter1 = _mm_unpacklo_epi8(zero, filter1);
883d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter1 = _mm_srai_epi16(filter1, 11);
884d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter1 = _mm_packs_epi16(filter1, filter1);
885d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
886d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter2 >> 3
887d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter2 = _mm_unpacklo_epi8(zero, filter2);
888d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter2 = _mm_srai_epi16(filter2, 11);
889d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filter2 = _mm_packs_epi16(filter2, zero);
890d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
891d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filt >> 1
892d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_adds_epi8(filter1, t1);
893d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_unpacklo_epi8(zero, filt);
894d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_srai_epi16(filt, 9);
895d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_packs_epi16(filt, zero);
896d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
897d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    filt = _mm_andnot_si128(hev, filt);
898d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
899d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
900d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
901d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
902d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q0 = _mm_and_si128(flat, q0);
903d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q0 = _mm_or_si128(work_a, q0);
904d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
905d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
906d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
907d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
908d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q1 = _mm_and_si128(flat, q1);
909d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q1 = _mm_or_si128(work_a, q1);
910d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
911d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
912d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
913d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
914d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q2 = _mm_and_si128(flat, q2);
915d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q2 = _mm_or_si128(work_a, q2);
916d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
917d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
918d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
919d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
920d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p0 = _mm_and_si128(flat, p0);
921d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p0 = _mm_or_si128(work_a, p0);
922d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
923d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
924d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
925d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
926d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p1 = _mm_and_si128(flat, p1);
927d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p1 = _mm_or_si128(work_a, p1);
928d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
929d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
930d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
931d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
932d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p2 = _mm_and_si128(flat, p2);
933d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p2 = _mm_or_si128(work_a, p2);
934d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
935d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
936d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
937d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
938d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
939d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
940d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
941d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
942d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org}
943d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
9448b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
9458b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_blimit0,
9468b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_limit0,
9478b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_thresh0,
9488b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_blimit1,
9498b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_limit1,
9508b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                    const uint8_t *_thresh1) {
951d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
952d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
953d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
954d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
955d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
956d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
957d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i zero = _mm_set1_epi16(0);
958d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i blimit =
959d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
960d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                         _mm_load_si128((const __m128i *)_blimit1));
961d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i limit =
962d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
963d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                         _mm_load_si128((const __m128i *)_limit1));
964d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  const __m128i thresh =
965d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
966d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                         _mm_load_si128((const __m128i *)_thresh1));
967d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
968d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i mask, hev, flat;
969d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
970d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
971d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
972d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
973d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
974d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
975d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
976d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
977d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
978d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
9796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  {
9806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
9816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                          _mm_subs_epu8(p0, p1));
9826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
9836fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                          _mm_subs_epu8(q0, q1));
9846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i one = _mm_set1_epi8(1);
9856fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i fe = _mm_set1_epi8(0xfe);
9866fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
9876fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
9886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                    _mm_subs_epu8(q0, p0));
9896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
9906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                    _mm_subs_epu8(q1, p1));
9916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i work;
992d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
993d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filter_mask and hev_mask
9946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
9956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    hev = _mm_subs_epu8(flat, thresh);
9966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
9976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
9986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
9996fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
10006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
10016fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
10026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
10036fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_max_epu8(flat, mask);
10046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    // mask |= (abs(p1 - p0) > limit) * -1;
10056fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    // mask |= (abs(q1 - q0) > limit) * -1;
10066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
10076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                     _mm_subs_epu8(p1, p2)),
10086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                         _mm_or_si128(_mm_subs_epu8(p3, p2),
10096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      _mm_subs_epu8(p2, p3)));
10106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_max_epu8(work, mask);
10116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
10126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                     _mm_subs_epu8(q1, q2)),
10136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                         _mm_or_si128(_mm_subs_epu8(q3, q2),
10146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      _mm_subs_epu8(q2, q3)));
10156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_max_epu8(work, mask);
10166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_subs_epu8(mask, limit);
10176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    mask = _mm_cmpeq_epi8(mask, zero);
10186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1019d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // flat_mask4
10206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
10216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                     _mm_subs_epu8(p0, p2)),
10226fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                         _mm_or_si128(_mm_subs_epu8(q2, q0),
10236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      _mm_subs_epu8(q0, q2)));
10246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_max_epu8(work, flat);
10256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
10266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                     _mm_subs_epu8(p0, p3)),
10276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                         _mm_or_si128(_mm_subs_epu8(q3, q0),
10286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      _mm_subs_epu8(q0, q3)));
10296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_max_epu8(work, flat);
10306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_subs_epu8(flat, one);
10316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_cmpeq_epi8(flat, zero);
10326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    flat = _mm_and_si128(flat, mask);
10336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  }
10346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  {
10356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i four = _mm_set1_epi16(4);
10366fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    unsigned char *src = s;
1037d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    int i = 0;
1038d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1039d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    do {
10406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      __m128i workp_a, workp_b, workp_shft;
10416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
10426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
10436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
10446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
10456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
10466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
10476fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
10486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
10496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
105006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
10516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
105206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
10536fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1054d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
10556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
10566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
10576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
10586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1059d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
10606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
10616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
106206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
10636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
10646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1065d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
10666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
10676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
10686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
10696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
10706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1071d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
10726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
10736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
107406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
10756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
10766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1077d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
10786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
10796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
108006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
10816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
10826fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
1083d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
10846fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                       _mm_packus_epi16(workp_shft, workp_shft));
1085d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1086d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org      src += 8;
1087d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    } while (++i < 2);
10886fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  }
10896fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  // lp filter
10906fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  {
10916fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t4 = _mm_set1_epi8(4);
10926fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t3 = _mm_set1_epi8(3);
10936fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t80 = _mm_set1_epi8(0x80);
10946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i te0 = _mm_set1_epi8(0xe0);
10956fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t1f = _mm_set1_epi8(0x1f);
10966fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t1 = _mm_set1_epi8(0x1);
10976fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    const __m128i t7f = _mm_set1_epi8(0x7f);
10986fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1099d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
11006fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      t80);
1101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
11026fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      t80);
1103d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
11046fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      t80);
1105d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
11066fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org                                      t80);
11076fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i filt;
11086fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i work_a;
11096fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    __m128i filter1, filter2;
11106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11116fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
11126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_subs_epi8(qs0, ps0);
11136fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
11146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
11156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_adds_epi8(filt, work_a);
1116d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // (vp9_filter + 3 * (qs0 - ps0)) & mask
11176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_and_si128(filt, mask);
11186fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11196fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter1 = _mm_adds_epi8(filt, t4);
11206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter2 = _mm_adds_epi8(filt, t3);
11216fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1122d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter1 >> 3
11236fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_cmpgt_epi8(zero, filter1);
11246fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter1 = _mm_srli_epi16(filter1, 3);
11256fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_and_si128(work_a, te0);
11266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter1 = _mm_and_si128(filter1, t1f);
11276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter1 = _mm_or_si128(filter1, work_a);
11286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1129d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // Filter2 >> 3
11306fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_cmpgt_epi8(zero, filter2);
11316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter2 = _mm_srli_epi16(filter2, 3);
11326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_and_si128(work_a, te0);
11336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter2 = _mm_and_si128(filter2, t1f);
11346fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filter2 = _mm_or_si128(filter2, work_a);
11356fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1136d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    // filt >> 1
11376fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_adds_epi8(filter1, t1);
11386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_cmpgt_epi8(zero, filt);
11396fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_srli_epi16(filt, 1);
11406fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_and_si128(work_a, t80);
11416fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_and_si128(filt, t7f);
11426fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_or_si128(filt, work_a);
11436fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11446fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    filt = _mm_andnot_si128(hev, filt);
11456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11466fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
1147d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q0 = _mm_load_si128((__m128i *)flat_oq0);
11486fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11496fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q0 = _mm_and_si128(flat, q0);
11506fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q0 = _mm_or_si128(work_a, q0);
11516fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11526fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
1153d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q1 = _mm_load_si128((__m128i *)flat_oq1);
11546fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11556fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q1 = _mm_and_si128(flat, q1);
11566fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q1 = _mm_or_si128(work_a, q1);
11576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11586fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
1159d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    q2 = _mm_load_si128((__m128i *)flat_oq2);
11606fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11616fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q2 = _mm_and_si128(flat, q2);
11626fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    q2 = _mm_or_si128(work_a, q2);
11636fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11646fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
1165d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p0 = _mm_load_si128((__m128i *)flat_op0);
11666fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11676fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p0 = _mm_and_si128(flat, p0);
11686fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p0 = _mm_or_si128(work_a, p0);
11696fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11706fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
1171d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p1 = _mm_load_si128((__m128i *)flat_op1);
11726fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11736fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p1 = _mm_and_si128(flat, p1);
11746fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p1 = _mm_or_si128(work_a, p1);
11756fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
11766fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
1177d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    p2 = _mm_load_si128((__m128i *)flat_op2);
11786fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    work_a = _mm_andnot_si128(flat, work_a);
11796fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p2 = _mm_and_si128(flat, p2);
11806fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org    p2 = _mm_or_si128(work_a, p2);
11816fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1182d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
1183d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
1184d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
1185d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
1186d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
1187d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
1188d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  }
1189d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org}
1190d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
// Applies the 4-tap loop filter across a horizontal edge to 16 pixel columns
// at once. Lanes 0-7 use the *0 thresholds and lanes 8-15 use the *1
// thresholds, so two adjacent 8-wide blocks are filtered in a single pass.
//
//   s        - first pixel of row q0, the row directly below the edge.
//   p        - distance in bytes between vertically adjacent pixels.
//   _blimit* - bound on |p0 - q0| * 2 + |p1 - q1| / 2 (8 bytes read from each).
//   _limit*  - bound on the difference between neighbouring rows on the same
//              side of the edge (8 bytes read from each).
//   _thresh* - high-edge-variance threshold (8 bytes read from each).
//
// Rows s - 4 * p .. s + 3 * p are read (16 bytes each); only the four rows
// s - 2 * p .. s + 1 * p are written back.
void vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  // Only the low 64 bits of each threshold vector end up in the combined
  // register, so load exactly those 8 bytes. Unlike a full aligned 128-bit
  // load this places no alignment requirement on the caller's pointers.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit0),
                         _mm_loadl_epi64((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_limit0),
                         _mm_loadl_epi64((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_thresh0),
                         _mm_loadl_epi64((const __m128i *)_thresh1));
  const __m128i zero = _mm_setzero_si128();
  const __m128i p3 = _mm_loadu_si128((const __m128i *)(s - 4 * p));
  const __m128i p2 = _mm_loadu_si128((const __m128i *)(s - 3 * p));
  const __m128i p1 = _mm_loadu_si128((const __m128i *)(s - 2 * p));
  const __m128i p0 = _mm_loadu_si128((const __m128i *)(s - 1 * p));
  const __m128i q0 = _mm_loadu_si128((const __m128i *)(s + 0 * p));
  const __m128i q1 = _mm_loadu_si128((const __m128i *)(s + 1 * p));
  const __m128i q2 = _mm_loadu_si128((const __m128i *)(s + 2 * p));
  const __m128i q3 = _mm_loadu_si128((const __m128i *)(s + 3 * p));
  __m128i mask, hev;

  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                                          _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                                          _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8((char)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    // max(|p1 - p0|, |q1 - q0|): feeds both the hev test and the limit test.
    const __m128i max_inner = _mm_max_epu8(abs_p1p0, abs_q1q0);
    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                                    _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                                    _mm_subs_epu8(q1, p1));
    __m128i work;

    // hev = 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
    hev = _mm_subs_epu8(max_inner, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    // There is no 8-bit shift; clear bit 0 of every byte first so the 16-bit
    // shift cannot move a bit into the neighbouring byte.
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    // 0xff where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit.
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // Accumulate the running maximum of the blimit flag and every
    // neighbouring-row difference; it is compared against limit below.
    mask = _mm_max_epu8(max_inner, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
                                     _mm_subs_epu8(p1, p2)),
                        _mm_or_si128(_mm_subs_epu8(p3, p2),
                                     _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
                                     _mm_subs_epu8(q1, q2)),
                        _mm_or_si128(_mm_subs_epu8(q3, q2),
                                     _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    // mask = 0xff in lanes whose running maximum does not exceed limit.
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i te0 = _mm_set1_epi8((char)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // Flip the top bit to map the unsigned pixels onto signed bytes. The rows
    // are still in registers from the loads above and nothing has been stored
    // since, so they are reused instead of being read from memory again.
    const __m128i ps1 = _mm_xor_si128(p1, t80);
    const __m128i ps0 = _mm_xor_si128(p0, t80);
    const __m128i qs0 = _mm_xor_si128(q0, t80);
    const __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vp9_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3 as a per-byte arithmetic shift: shift the 16-bit lanes
    // logically, keep the low five bits of each byte, then set the top three
    // bits again in the bytes that were negative.
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3, same technique.
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt = (Filter1 + 1) >> 1, again as a per-byte arithmetic shift.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    // The adjustment of p1/q1 is dropped in lanes where hev is set.
    filt = _mm_andnot_si128(hev, filt);

    // Apply the adjustments with signed saturation and flip the top bit back
    // to return to unsigned pixel values.
    _mm_storeu_si128((__m128i *)(s - 2 * p),
                     _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80));
    _mm_storeu_si128((__m128i *)(s - 1 * p),
                     _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80));
    _mm_storeu_si128((__m128i *)(s + 0 * p),
                     _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80));
    _mm_storeu_si128((__m128i *)(s + 1 * p),
                     _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80));
  }
}
13286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Transposes two 8x8 byte tiles into one 8-row by 16-byte result.
//
// Eight rows of eight bytes are read from in0 and from in1 (row stride in_p).
// Output row j (stride out_p) receives column j of in0 in its first eight
// bytes followed by column j of in1 in its last eight bytes.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  // Source rows: one 8-byte row in the low half of each register.
  const __m128i a0 = _mm_loadl_epi64((const __m128i *)in0);
  const __m128i b0 = _mm_loadl_epi64((const __m128i *)in1);
  const __m128i a1 = _mm_loadl_epi64((const __m128i *)(in0 + in_p));
  const __m128i b1 = _mm_loadl_epi64((const __m128i *)(in1 + in_p));
  const __m128i a2 = _mm_loadl_epi64((const __m128i *)(in0 + 2 * in_p));
  const __m128i b2 = _mm_loadl_epi64((const __m128i *)(in1 + 2 * in_p));
  const __m128i a3 = _mm_loadl_epi64((const __m128i *)(in0 + 3 * in_p));
  const __m128i b3 = _mm_loadl_epi64((const __m128i *)(in1 + 3 * in_p));
  const __m128i a4 = _mm_loadl_epi64((const __m128i *)(in0 + 4 * in_p));
  const __m128i b4 = _mm_loadl_epi64((const __m128i *)(in1 + 4 * in_p));
  const __m128i a5 = _mm_loadl_epi64((const __m128i *)(in0 + 5 * in_p));
  const __m128i b5 = _mm_loadl_epi64((const __m128i *)(in1 + 5 * in_p));
  const __m128i a6 = _mm_loadl_epi64((const __m128i *)(in0 + 6 * in_p));
  const __m128i b6 = _mm_loadl_epi64((const __m128i *)(in1 + 6 * in_p));
  const __m128i a7 = _mm_loadl_epi64((const __m128i *)(in0 + 7 * in_p));
  const __m128i b7 = _mm_loadl_epi64((const __m128i *)(in1 + 7 * in_p));

  // Stage 1: interleave the bytes of adjacent row pairs.
  const __m128i a01 = _mm_unpacklo_epi8(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi8(a2, a3);
  const __m128i a45 = _mm_unpacklo_epi8(a4, a5);
  const __m128i a67 = _mm_unpacklo_epi8(a6, a7);
  const __m128i b01 = _mm_unpacklo_epi8(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi8(b2, b3);
  const __m128i b45 = _mm_unpacklo_epi8(b4, b5);
  const __m128i b67 = _mm_unpacklo_epi8(b6, b7);

  // Stage 2: interleave 16-bit pairs. "lo" covers source columns 0-3 and
  // "hi" covers source columns 4-7, for rows 0-3 and rows 4-7 respectively.
  const __m128i a_lo_0123 = _mm_unpacklo_epi16(a01, a23);
  const __m128i a_lo_4567 = _mm_unpacklo_epi16(a45, a67);
  const __m128i b_lo_0123 = _mm_unpacklo_epi16(b01, b23);
  const __m128i b_lo_4567 = _mm_unpacklo_epi16(b45, b67);
  const __m128i a_hi_0123 = _mm_unpackhi_epi16(a01, a23);
  const __m128i a_hi_4567 = _mm_unpackhi_epi16(a45, a67);
  const __m128i b_hi_0123 = _mm_unpackhi_epi16(b01, b23);
  const __m128i b_hi_4567 = _mm_unpackhi_epi16(b45, b67);

  // Stage 3: interleave 32-bit groups. Each register now holds two complete
  // 8-byte source columns (e.g. a_c01 = column 0 | column 1 of in0).
  const __m128i a_c01 = _mm_unpacklo_epi32(a_lo_0123, a_lo_4567);
  const __m128i a_c23 = _mm_unpackhi_epi32(a_lo_0123, a_lo_4567);
  const __m128i b_c01 = _mm_unpacklo_epi32(b_lo_0123, b_lo_4567);
  const __m128i b_c23 = _mm_unpackhi_epi32(b_lo_0123, b_lo_4567);
  const __m128i a_c45 = _mm_unpacklo_epi32(a_hi_0123, a_hi_4567);
  const __m128i a_c67 = _mm_unpackhi_epi32(a_hi_0123, a_hi_4567);
  const __m128i b_c45 = _mm_unpacklo_epi32(b_hi_0123, b_hi_4567);
  const __m128i b_c67 = _mm_unpackhi_epi32(b_hi_0123, b_hi_4567);

  // Stage 4: pair each in0 column with the matching in1 column and store the
  // eight 16-byte output rows in ascending order.
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(a_c01, b_c01));
  _mm_storeu_si128((__m128i *)(out + out_p),
                   _mm_unpackhi_epi64(a_c01, b_c01));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p),
                   _mm_unpacklo_epi64(a_c23, b_c23));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p),
                   _mm_unpackhi_epi64(a_c23, b_c23));
  _mm_storeu_si128((__m128i *)(out + 4 * out_p),
                   _mm_unpacklo_epi64(a_c45, b_c45));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p),
                   _mm_unpackhi_epi64(a_c45, b_c45));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p),
                   _mm_unpacklo_epi64(a_c67, b_c67));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p),
                   _mm_unpackhi_epi64(a_c67, b_c67));
}
13946fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
// Transposes a list of independent 8x8 byte tiles.
//
// For each index i below num_8x8_to_transpose, eight 8-byte rows are read
// from src[i] (row stride in_p) and the transposed tile is written to dst[i]
// (row stride out_p). Tile 0 is always processed because the loop tests its
// condition only after the body has run.
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int tile = 0;
  do {
    const unsigned char *const in = src[tile];
    unsigned char *const out = dst[tile];

    // One source row per register; "rc" below denotes row r, column c.
    const __m128i r0 = _mm_loadl_epi64((const __m128i *)(in + 0 * in_p));
    const __m128i r1 = _mm_loadl_epi64((const __m128i *)(in + 1 * in_p));
    const __m128i r2 = _mm_loadl_epi64((const __m128i *)(in + 2 * in_p));
    const __m128i r3 = _mm_loadl_epi64((const __m128i *)(in + 3 * in_p));
    const __m128i r4 = _mm_loadl_epi64((const __m128i *)(in + 4 * in_p));
    const __m128i r5 = _mm_loadl_epi64((const __m128i *)(in + 5 * in_p));
    const __m128i r6 = _mm_loadl_epi64((const __m128i *)(in + 6 * in_p));
    const __m128i r7 = _mm_loadl_epi64((const __m128i *)(in + 7 * in_p));

    // Byte interleave of row pairs, e.g. r01 = 00 10 01 11 ... 07 17.
    const __m128i r01 = _mm_unpacklo_epi8(r0, r1);
    const __m128i r23 = _mm_unpacklo_epi8(r2, r3);
    const __m128i r45 = _mm_unpacklo_epi8(r4, r5);
    const __m128i r67 = _mm_unpacklo_epi8(r6, r7);

    // 16-bit interleave: lo_0123 = 00 10 20 30 01 11 21 31 ... 03 13 23 33,
    // hi_0123 = 04 14 24 34 ... 07 17 27 37 (likewise for rows 4-7).
    const __m128i lo_0123 = _mm_unpacklo_epi16(r01, r23);
    const __m128i lo_4567 = _mm_unpacklo_epi16(r45, r67);
    const __m128i hi_0123 = _mm_unpackhi_epi16(r01, r23);
    const __m128i hi_4567 = _mm_unpackhi_epi16(r45, r67);

    // 32-bit interleave: every register now carries two finished output
    // rows, e.g. cols01 = 00 10 .. 70 | 01 11 .. 71.
    const __m128d cols01 =
        _mm_castsi128_pd(_mm_unpacklo_epi32(lo_0123, lo_4567));
    const __m128d cols23 =
        _mm_castsi128_pd(_mm_unpackhi_epi32(lo_0123, lo_4567));
    const __m128d cols45 =
        _mm_castsi128_pd(_mm_unpacklo_epi32(hi_0123, hi_4567));
    const __m128d cols67 =
        _mm_castsi128_pd(_mm_unpackhi_epi32(hi_0123, hi_4567));

    // Write the low and high 8 bytes of each register as separate rows.
    _mm_storel_pd((double *)(out + 0 * out_p), cols01);
    _mm_storeh_pd((double *)(out + 1 * out_p), cols01);
    _mm_storel_pd((double *)(out + 2 * out_p), cols23);
    _mm_storeh_pd((double *)(out + 3 * out_p), cols23);
    _mm_storel_pd((double *)(out + 4 * out_p), cols45);
    _mm_storeh_pd((double *)(out + 5 * out_p), cols45);
    _mm_storel_pd((double *)(out + 6 * out_p), cols67);
    _mm_storeh_pd((double *)(out + 7 * out_p), cols67);

    ++tile;
  } while (tile < num_8x8_to_transpose);
}
14576fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
14588b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
14598b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *limit0,
14608b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *thresh0,
14618b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *blimit1,
14628b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *limit1,
14638b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *thresh1) {
1464d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
1465d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *src[2];
1466d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *dst[2];
1467d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1468d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose 8x16
1469d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1470d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1471d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Loop filtering
14728b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
14738b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                 blimit1, limit1, thresh1);
1474d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[0] = t_dst;
1475d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[1] = t_dst + 8;
1476d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[0] = s - 4;
1477d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[1] = s - 4 + p * 8;
1478d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1479d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose back
1480d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose(src, 16, dst, p, 2);
1481d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org}
1482d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
14838b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
14848b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                             const unsigned char *blimit,
14858b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                             const unsigned char *limit,
14868b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                             const unsigned char *thresh, int count) {
1487d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
1488d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *src[1];
1489d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *dst[1];
1490d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  (void)count;
1491d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1492d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose 8x8
1493d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[0] = s - 4;
1494d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[0] = t_dst;
1495d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1496d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose(src, p, dst, 8, 1);
1497d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1498d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Loop filtering
14998b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);
1500d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1501d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[0] = t_dst;
1502d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[0] = s - 4;
1503d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1504d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose back
1505d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose(src, 8, dst, p, 1);
1506d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org}
1507d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
15088b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
15098b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *limit0,
15108b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *thresh0,
15118b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *blimit1,
15128b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *limit1,
15138b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                  const uint8_t *thresh1) {
1514d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
15156fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  unsigned char *src[2];
15166fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  unsigned char *dst[2];
15176fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1518d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose 8x16
1519d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
15206fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1521d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Loop filtering
15228b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
15238b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                 blimit1, limit1, thresh1);
1524d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[0] = t_dst;
1525d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[1] = t_dst + 8;
15266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1527d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[0] = s - 4;
1528d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[1] = s - 4 + p * 8;
15296fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
1530d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose back
15316fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org  transpose(src, 16, dst, p, 2);
15326fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org}
15336fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org
15348b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
15358b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                              const unsigned char *blimit,
15368b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                              const unsigned char *limit,
15378b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                              const unsigned char *thresh) {
1538d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
1539d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *src[2];
1540d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  unsigned char *dst[2];
1541f7b25aef0ed571110c9f656f29ead07b02d33d89fgalligan@chromium.org
1542f7b25aef0ed571110c9f656f29ead07b02d33d89fgalligan@chromium.org  src[0] = s - 8;
1543d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[1] = s;
1544d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[0] = t_dst;
1545d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[1] = t_dst + 8 * 8;
1546f7b25aef0ed571110c9f656f29ead07b02d33d89fgalligan@chromium.org
1547d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose 16x8
1548d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose(src, p, dst, 8, 2);
1549d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
1550d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Loop filtering
1551d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
1552d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
1553d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  src[0] = t_dst;
1554d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  src[1] = t_dst + 8 * 8;
1555d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org  dst[0] = s - 8;
1556d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  dst[1] = s;
1557d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org
1558d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose back
1559d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose(src, 8, dst, p, 2);
1560d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org}
1561d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
15628b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
15638b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                   const uint8_t *blimit, const uint8_t *limit,
15648b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org                                   const uint8_t *thresh) {
1565d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
1566d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1567d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose 16x16
1568d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1569d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1570d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1571d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Loop filtering
1572d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
1573d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org                                   thresh);
1574d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
1575d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  // Transpose back
1576d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1577d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1578d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org}
1579