16fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org/* 26fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 36fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * 46fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * Use of this source code is governed by a BSD-style license 56fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * that can be found in the LICENSE file in the root of the source 66fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * tree. An additional intellectual property rights grant can be found 76fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * in the file PATENTS. All contributing project authors may 86fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org * be found in the AUTHORS file in the root of the source tree. 96fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org */ 106fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org 11d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org#include <emmintrin.h> // SSE2 126fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org#include "vp9/common/vp9_loopfilter.h" 13d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org#include "vpx_ports/emmintrin_compat.h" 146fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org 1587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.orgstatic INLINE __m128i abs_diff(__m128i a, __m128i b) { 1687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); 1787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org} 1887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 1947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, 2047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org int 
p, 2147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_blimit, 2247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_limit, 2347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_thresh) { 24d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org const __m128i zero = _mm_set1_epi16(0); 2506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org const __m128i one = _mm_set1_epi8(1); 26d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 27d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i limit = _mm_load_si128((const __m128i *)_limit); 28d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 29d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i mask, hev, flat, flat2; 30f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; 31f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i abs_p1p0; 32d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 33f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); 34f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), 35f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 4 * p))); 36f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); 37f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), 38f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 3 * p))); 
39f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); 40f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), 41f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 2 * p))); 42f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); 43f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), 44f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 1 * p))); 45f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p1q1 = _mm_shuffle_epi32(q1p1, 78); 46f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); 47f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), 48f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s - 0 * p))); 49f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p0q0 = _mm_shuffle_epi32(q0p0, 78); 5006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org 51d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org { 52f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; 5387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p1p0 = abs_diff(q1p1, q0p0); 54f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); 55f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org fe = _mm_set1_epi8(0xfe); 56f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 5787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p0q0 = abs_diff(q0p0, p0q0); 
5887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p1q1 = abs_diff(q1p1, p1q1); 59d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 60d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org hev = _mm_subs_epu8(flat, thresh); 61d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 62d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 63d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 64d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 65d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 66d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 67d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 68f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org mask = _mm_max_epu8(abs_p1p0, mask); 69d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org // mask |= (abs(p1 - p0) > limit) * -1; 70d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org // mask |= (abs(q1 - q0) > limit) * -1; 71f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 7287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(q2p2, q1p1), 7387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_diff(q3p3, q2p2)); 74d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org mask = _mm_max_epu8(work, mask); 75f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); 76d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org mask = _mm_subs_epu8(mask, 
limit); 77d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org mask = _mm_cmpeq_epi8(mask, zero); 78d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org } 79d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 80d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org // lp filter 81d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org { 82d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org const __m128i t4 = _mm_set1_epi8(4); 83d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org const __m128i t3 = _mm_set1_epi8(3); 84d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org const __m128i t80 = _mm_set1_epi8(0x80); 85f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org const __m128i t1 = _mm_set1_epi16(0x1); 86f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); 87f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); 88f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i qs0 = _mm_xor_si128(p0q0, t80); 89f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i qs1 = _mm_xor_si128(p1q1, t80); 90d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org __m128i filt; 91d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org __m128i work_a; 92d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org __m128i filter1, filter2; 93f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; 94f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; 95d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 96f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); 
97f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org work_a = _mm_subs_epi8(qs0, qs0ps0); 98d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filt = _mm_adds_epi8(filt, work_a); 99d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filt = _mm_adds_epi8(filt, work_a); 100d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filt = _mm_adds_epi8(filt, work_a); 101d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // (vp9_filter + 3 * (qs0 - ps0)) & mask 102d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filt = _mm_and_si128(filt, mask); 103d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 104d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filter1 = _mm_adds_epi8(filt, t4); 105d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org filter2 = _mm_adds_epi8(filt, t3); 106d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 107f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filter1 = _mm_unpacklo_epi8(zero, filter1); 108f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filter1 = _mm_srai_epi16(filter1, 0xB); 109f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filter2 = _mm_unpacklo_epi8(zero, filter2); 110f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filter2 = _mm_srai_epi16(filter2, 0xB); 111f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 112d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter1 >> 3 113f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); 114f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); 115d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org 116d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filt >> 1 
117f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_adds_epi16(filter1, t1); 118f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_srai_epi16(filt, 1); 119f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), 120f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt); 121f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); 122f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); 12306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org // loopfilter done 12406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org 12506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org { 12606d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org __m128i work; 12787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0)); 128f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat = _mm_max_epu8(abs_p1p0, flat); 129f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); 13006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat = _mm_subs_epu8(flat, one); 13106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat = _mm_cmpeq_epi8(flat, zero); 13206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat = _mm_and_si128(flat, mask); 13306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org 134f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); 135f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), 
136f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 5 * p))); 137f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 138f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); 139f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), 140f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 6 * p))); 14187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); 142f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 143f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); 144f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), 145f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org (__m64 *)(s + 7 * p))); 14687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); 14706d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat2 = _mm_max_epu8(work, flat2); 148f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); 14906d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat2 = _mm_subs_epu8(flat2, one); 15006d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat2 = _mm_cmpeq_epi8(flat2, zero); 15106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask 15206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org } 15306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org 15406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
15506d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org // flat and wide flat calculations 15606d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org { 15706d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org const __m128i eight = _mm_set1_epi16(8); 15806d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org const __m128i four = _mm_set1_epi16(4); 159f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; 160f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; 161f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i pixelFilter_p, pixelFilter_q; 162f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; 163f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; 164f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 165f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p7_16 = _mm_unpacklo_epi8(q7p7, zero);; 166f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p6_16 = _mm_unpacklo_epi8(q6p6, zero); 167f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p5_16 = _mm_unpacklo_epi8(q5p5, zero); 168f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p4_16 = _mm_unpacklo_epi8(q4p4, zero); 169f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p3_16 = _mm_unpacklo_epi8(q3p3, zero); 170f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p2_16 = _mm_unpacklo_epi8(q2p2, zero); 171f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p1_16 = _mm_unpacklo_epi8(q1p1, zero); 172f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org p0_16 = _mm_unpacklo_epi8(q0p0, zero); 173f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0_16 = _mm_unpackhi_epi8(q0p0, 
zero); 174f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1_16 = _mm_unpackhi_epi8(q1p1, zero); 175f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2_16 = _mm_unpackhi_epi8(q2p2, zero); 176f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q3_16 = _mm_unpackhi_epi8(q3p3, zero); 177f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q4_16 = _mm_unpackhi_epi8(q4p4, zero); 178f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q5_16 = _mm_unpackhi_epi8(q5p5, zero); 179f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q6_16 = _mm_unpackhi_epi8(q6p6, zero); 180f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q7_16 = _mm_unpackhi_epi8(q7p7, zero); 181f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 182f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), 183f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(p4_16, p3_16)); 184f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), 185f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(q4_16, q3_16)); 186f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 187f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); 188f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); 189f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 190f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); 191f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); 192f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 
pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, 193f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q)); 194f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_p2p1p0 = _mm_add_epi16(four, 195f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(pixetFilter_p2p1p0, 196f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_q2q1q0)); 197f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 198f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(p7_16, p0_16)), 4); 199f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 200f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(q7_16, q0_16)), 4); 201f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q0p0 = _mm_packus_epi16(res_p, res_q); 202f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 203f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(p3_16, p0_16)), 3); 204f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 205f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(q3_16, q0_16)), 3); 206f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 207f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q0p0 = _mm_packus_epi16(res_p, res_q); 208f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 209f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = _mm_add_epi16(p7_16, p7_16); 210f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(q7_16, q7_16); 211f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p3 = _mm_add_epi16(p3_16, p3_16); 
212f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q3 = _mm_add_epi16(q3_16, q3_16); 213f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 214f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); 215f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); 216f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 217f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, p1_16)), 4); 218f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 219f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q1_16)), 4); 220f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q1p1 = _mm_packus_epi16(res_p, res_q); 221f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 222f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); 223f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); 224f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 225f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p3, p1_16)), 3); 226f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, 227f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q3, q1_16)), 3); 228f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q1p1 = _mm_packus_epi16(res_p, res_q); 229f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 230f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = 
_mm_add_epi16(sum_p7, p7_16); 231f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(sum_q7, q7_16); 232f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p3 = _mm_add_epi16(sum_p3, p3_16); 233f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q3 = _mm_add_epi16(sum_q3, q3_16); 234f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 235f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); 236f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); 237f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 238f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, p2_16)), 4); 239f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 240f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q2_16)), 4); 241f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q2p2 = _mm_packus_epi16(res_p, res_q); 242f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 243f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); 244f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); 245f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 246f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 247f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p3, p2_16)), 3); 248f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, 
249f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q3, q2_16)), 3); 250f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q2p2 = _mm_packus_epi16(res_p, res_q); 251f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 252f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = _mm_add_epi16(sum_p7, p7_16); 253f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(sum_q7, q7_16); 254f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); 255f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); 256f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 257f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, p3_16)), 4); 258f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 259f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q3_16)), 4); 260f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q3p3 = _mm_packus_epi16(res_p, res_q); 261f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 262f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = _mm_add_epi16(sum_p7, p7_16); 263f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(sum_q7, q7_16); 264f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); 265f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); 266f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 267f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, 
p4_16)), 4); 268f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 269f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q4_16)), 4); 270f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q4p4 = _mm_packus_epi16(res_p, res_q); 271f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 272f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = _mm_add_epi16(sum_p7, p7_16); 273f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(sum_q7, q7_16); 274f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); 275f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); 276f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 277f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, p5_16)), 4); 278f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 279f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q5_16)), 4); 280f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q5p5 = _mm_packus_epi16(res_p, res_q); 281f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 282f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_p7 = _mm_add_epi16(sum_p7, p7_16); 283f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org sum_q7 = _mm_add_epi16(sum_q7, q7_16); 284f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); 285f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); 286f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_p = 
_mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 287f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_p7, p6_16)), 4); 288f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 289f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_add_epi16(sum_q7, q6_16)), 4); 290f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q6p6 = _mm_packus_epi16(res_p, res_q); 29106d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org } 29206d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org // wide flat 29306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 29406d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org 295f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat = _mm_shuffle_epi32(flat, 68); 296f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2 = _mm_shuffle_epi32(flat2, 68); 297f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 298f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_andnot_si128(flat, q2p2); 299f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q2p2 = _mm_and_si128(flat, flat_q2p2); 300f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_or_si128(q2p2, flat_q2p2); 301f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 302f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org qs1ps1 = _mm_andnot_si128(flat, qs1ps1); 303f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q1p1 = _mm_and_si128(flat, flat_q1p1); 304f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); 305f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 306f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org qs0ps0 = _mm_andnot_si128(flat, qs0ps0); 
307f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat_q0p0 = _mm_and_si128(flat, flat_q0p0); 308f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); 309f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 310f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q6p6 = _mm_andnot_si128(flat2, q6p6); 311f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); 312f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q6p6 = _mm_or_si128(q6p6, flat2_q6p6); 313f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); 314f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); 315f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 316f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q5p5 = _mm_andnot_si128(flat2, q5p5); 317f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); 318f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q5p5 = _mm_or_si128(q5p5, flat2_q5p5); 319f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); 320f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); 321f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 322f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q4p4 = _mm_andnot_si128(flat2, q4p4); 323f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); 324f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q4p4 = _mm_or_si128(q4p4, flat2_q4p4); 325f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); 
326f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); 327f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 328f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q3p3 = _mm_andnot_si128(flat2, q3p3); 329f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); 330f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q3p3 = _mm_or_si128(q3p3, flat2_q3p3); 331f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); 332f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); 333f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 334f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_andnot_si128(flat2, q2p2); 335f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); 336f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q2p2 = _mm_or_si128(q2p2, flat2_q2p2); 337f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); 338f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); 339f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 340f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1p1 = _mm_andnot_si128(flat2, q1p1); 341f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); 342f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q1p1 = _mm_or_si128(q1p1, flat2_q1p1); 343f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); 344f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s + 1 * p), 
_mm_castsi128_ps(q1p1)); 345f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org 346f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0p0 = _mm_andnot_si128(flat2, q0p0); 347f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); 348f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org q0p0 = _mm_or_si128(q0p0, flat2_q0p0); 349f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); 350f9586bb54d74c97d07b09eb2512f8569c9c1c025fgalligan@chromium.org _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); 351d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org } 352d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org} 353d348b8d765c019ee7250075d663a83db00c65c08tomfinegan@chromium.org

// Updates a running sum held in 16-bit lanes: returns
//   *total + *a1 + *a2 - (*s1 + *s2)
// computed lane-wise with _mm_add_epi16 / _mm_sub_epi16.
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  const __m128i removed = _mm_add_epi16(*s1, *s2);
  const __m128i with_a1 = _mm_add_epi16(*a1, *total);
  const __m128i reduced = _mm_sub_epi16(with_a1, removed);
  return _mm_add_epi16(reduced, *a2);
}

// Shifts the 16-bit sums in *f8_lo / *f8_hi right by 3, packs them to bytes
// with unsigned saturation, then selects per bit: the packed value where
// *flat is set, *other_filt where it is clear.
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i lo_shifted = _mm_srli_epi16(*f8_lo, 3);
  const __m128i hi_shifted = _mm_srli_epi16(*f8_hi, 3);
  const __m128i packed = _mm_packus_epi16(lo_shifted, hi_shifted);
  const __m128i keep_other = _mm_andnot_si128(*flat, *other_filt);
  const __m128i take_packed = _mm_and_si128(*flat, packed);
  return _mm_or_si128(keep_other, take_packed);
}

// Same selection as filter8_mask, but the 16-bit sums in *f_lo / *f_hi are
// shifted right by 4 before being packed to bytes.
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i lo_shifted = _mm_srli_epi16(*f_lo, 4);
  const __m128i hi_shifted = _mm_srli_epi16(*f_hi, 4);
  const __m128i packed = _mm_packus_epi16(lo_shifted, hi_shifted);
  const __m128i keep_other = _mm_andnot_si128(*flat, *other_filt);
  const __m128i take_packed = _mm_and_si128(*flat, packed);
  return _mm_or_si128(keep_other, take_packed);
}
38447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.orgstatic void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, 38547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org int p, 38647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_blimit, 38747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_limit, 38847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const unsigned char *_thresh) { 38947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i zero = _mm_set1_epi16(0); 39047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i one = _mm_set1_epi8(1); 391d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 392d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i limit = _mm_load_si128((const __m128i *)_limit); 393d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 394d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i mask, hev, flat, flat2; 39547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i p7, p6, p5; 39647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; 39747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i q5, q6, q7; 39847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 39987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i op2, op1, op0, oq0, oq1, oq2; 40087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 40187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i max_abs_p1p0q1q0; 40287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 40387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p7 = 
_mm_loadu_si128((__m128i *)(s - 8 * p)); 40487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); 40587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); 40647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); 40747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); 40847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); 40947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); 41047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); 41147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); 41247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); 41347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); 41447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); 41547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); 41687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); 41787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); 41887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); 41947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 42047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org { 42187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i abs_p1p0 = 
abs_diff(p1, p0); 42287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i abs_q1q0 = abs_diff(q1, q0); 42347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i fe = _mm_set1_epi8(0xfe); 42487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i ff = _mm_cmpeq_epi8(zero, zero); 42587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i abs_p0q0 = abs_diff(p0, q0); 42687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i abs_p1q1 = abs_diff(p1, q1); 42747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i work; 42887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0); 42947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 43047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 43147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 43247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 43347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 43447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 43587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); 43647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // mask |= (abs(p1 - p0) > limit) * -1; 43747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // mask |= (abs(q1 - q0) > limit) * -1; 43887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2)); 43947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
mask = _mm_max_epu8(work, mask); 44087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); 44147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_max_epu8(work, mask); 44247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_subs_epu8(mask, limit); 44347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mask = _mm_cmpeq_epi8(mask, zero); 44447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 44547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 44687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org { 44787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i work; 44887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0)); 44987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_max_epu8(work, max_abs_p1p0q1q0); 45087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0)); 45187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_max_epu8(work, flat); 45287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0)); 45387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_subs_epu8(flat, one); 45487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_cmpeq_epi8(flat, zero); 45587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_and_si128(flat, mask); 45687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0)); 45787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_max_epu8(work, flat2); 45887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p6, 
p0), abs_diff(q6, q0)); 45987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_max_epu8(work, flat2); 46087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0)); 46187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_max_epu8(work, flat2); 46287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_subs_epu8(flat2, one); 46387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_cmpeq_epi8(flat2, zero); 46487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask 46587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org } 46687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 46787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org // filter4 46947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org { 47047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t4 = _mm_set1_epi8(4); 47147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t3 = _mm_set1_epi8(3); 47247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t80 = _mm_set1_epi8(0x80); 47347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i te0 = _mm_set1_epi8(0xe0); 47447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t1f = _mm_set1_epi8(0x1f); 47547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t1 = _mm_set1_epi8(0x1); 47647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i t7f = _mm_set1_epi8(0x7f); 47787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i ff = _mm_cmpeq_epi8(t4, t4); 
47847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 47947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i filt; 48047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i work_a; 48147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org __m128i filter1, filter2; 48247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 48387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op1 = _mm_xor_si128(p1, t80); 48487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op0 = _mm_xor_si128(p0, t80); 48587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq0 = _mm_xor_si128(q0, t80); 48687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq1 = _mm_xor_si128(q1, t80); 48787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 48887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); 48987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 49087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); 49187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 49287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work_a = _mm_subs_epi8(oq0, op0); 49347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 49447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 49547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 496d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // (vp9_filter + 3 * (qs0 - ps0)) & mask 49747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_and_si128(filt, mask); 49847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 
filter1 = _mm_adds_epi8(filt, t4); 49947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter2 = _mm_adds_epi8(filt, t3); 50047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 501d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter1 >> 3 50247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, filter1); 50347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter1 = _mm_srli_epi16(filter1, 3); 50447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_and_si128(work_a, te0); 50547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter1 = _mm_and_si128(filter1, t1f); 50647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter1 = _mm_or_si128(filter1, work_a); 50787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80); 50847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 509d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter2 >> 3 51047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, filter2); 51147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter2 = _mm_srli_epi16(filter2, 3); 51247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_and_si128(work_a, te0); 51347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter2 = _mm_and_si128(filter2, t1f); 51447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filter2 = _mm_or_si128(filter2, work_a); 51587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80); 51647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 517d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filt >> 1 51847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt 
= _mm_adds_epi8(filter1, t1); 51947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, filt); 52047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_srli_epi16(filt, 1); 52147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org work_a = _mm_and_si128(work_a, t80); 52247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_and_si128(filt, t7f); 52347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_or_si128(filt, work_a); 52447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org filt = _mm_andnot_si128(hev, filt); 52587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80); 52687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80); 52747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // loopfilter done 52847265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 52987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 53087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org // filter8 53147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org { 53287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i four = _mm_set1_epi16(4); 53387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); 53487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); 53587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); 53687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); 53787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i 
q0_lo = _mm_unpacklo_epi8(q0, zero); 53887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); 53987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); 54087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); 54187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 54287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); 54387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); 54487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); 54587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); 54687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); 54787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); 54887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); 54987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); 55087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i f8_lo, f8_hi; 55187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 55287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four), 55387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p3_lo, p2_lo)); 55487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo), 55587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 
_mm_add_epi16(p2_lo, p1_lo)); 55687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo); 55787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 55887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four), 55987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p3_hi, p2_hi)); 56087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi), 56187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p2_hi, p1_hi)); 56287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi); 56387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 56487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi); 56587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 56687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo); 56787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi); 56887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi); 56987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 57087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo); 57187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi); 57287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi); 57387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 
57487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo); 57587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi); 57687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi); 57787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 57887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo); 57987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi); 58087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi); 58187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 58287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo); 58387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi); 58487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi); 58547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 58647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 58747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 58887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org // wide flat calculations 58947265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org { 59047265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org const __m128i eight = _mm_set1_epi16(8); 59187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero); 
59287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero); 59387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero); 59487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero); 59587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero); 59687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero); 59787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero); 59887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero); 59987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero); 60087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero); 60187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero); 60287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero); 60387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero); 60487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero); 60587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero); 60687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero); 60787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 60887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero); 
60987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero); 61087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero); 61187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero); 61287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero); 61387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero); 61487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero); 61587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero); 61687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero); 61787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero); 61887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero); 61987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero); 62087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero); 62187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero); 62287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero); 62387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero); 62487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 62587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i f_lo; 
62687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org __m128i f_hi; 62787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 62887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7 62987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), 63087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p4_lo, f_lo)); 63187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo), 63287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p2_lo, p1_lo)); 63387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo); 63487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo); 63587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 63687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7 63787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), 63887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p4_hi, f_hi)); 63987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi), 64087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_add_epi16(p2_hi, p1_hi)); 64187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi); 64287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); 64387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 64487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p6 = 
filter16_mask(&flat2, &p6, &f_lo, &f_hi); 64587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 7 * p), p6); 64687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 64787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); 64887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); 64987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); 65087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 6 * p), p5); 65187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 65287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); 65387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); 65487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); 65587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 5 * p), p4); 65687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 65787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); 65887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); 65987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); 66087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 4 * p), p3); 66187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 66287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = 
filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); 66387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); 66487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); 66587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 3 * p), op2); 66687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 66787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); 66887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); 66987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); 67087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 2 * p), op1); 67187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 67287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); 67387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); 67487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); 67587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 1 * p), op0); 67687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 67787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); 67887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); 67987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq0 = filter16_mask(&flat2, &oq0, 
&f_lo, &f_hi); 68087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); 68187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 68287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); 68387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); 68487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); 68587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); 68687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 68787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); 68887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); 68987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); 69087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); 69187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 69287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); 69387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); 69487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); 69587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 3 * p), q3); 69687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 69787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = 
filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); 69887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); 69987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); 70087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 4 * p), q4); 70187997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 70287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); 70387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); 70487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); 70587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 5 * p), q5); 70687997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org 70787997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); 70887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); 70987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); 71087997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 6 * p), q6); 71147265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 71247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // wide flat 71347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org } 71547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 
71647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 717d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 7188b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, 7198b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_blimit, 7208b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_limit, 7218b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_thresh, int count) { 72247265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org if (count == 1) 72347265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); 72447265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org else 72547265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); 72647265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org} 72747265f8fe3a36a426773454ad90d20c9aa616c24johannkoenig@chromium.org 7288b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_horizontal_8_sse2(unsigned char *s, int p, 7298b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_blimit, 7308b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_limit, 7318b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_thresh, int count) { 732d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); 733d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); 734d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, 
unsigned char, flat_op0, 16); 735d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); 736d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); 737d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); 7386fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org const __m128i zero = _mm_set1_epi16(0); 739d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 740d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i limit = _mm_load_si128((const __m128i *)_limit); 741d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 742d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i mask, hev, flat; 74306d88a191f52640d533e6204ba067d1fd6fc0accjohannkoenig@chromium.org __m128i p3, p2, p1, p0, q0, q1, q2, q3; 744d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; 7456fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org 746f7b25aef0ed571110c9f656f29ead07b02d33d89fgalligan@chromium.org (void)count; 747d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 748d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), 749d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_loadl_epi64((__m128i *)(s + 3 * p))); 750d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), 751d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_loadl_epi64((__m128i *)(s + 2 * p))); 
752d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), 753d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_loadl_epi64((__m128i *)(s + 1 * p))); 754d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), 755d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_loadl_epi64((__m128i *)(s - 0 * p))); 756d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1q1 = _mm_shuffle_epi32(q1p1, 78); 757d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0q0 = _mm_shuffle_epi32(q0p0, 78); 758d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 759d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 760d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filter_mask and hev_mask 761d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i one = _mm_set1_epi8(1); 762d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i fe = _mm_set1_epi8(0xfe); 763d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ff = _mm_cmpeq_epi8(fe, fe); 764d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; 76587997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p1p0 = abs_diff(q1p1, q0p0); 766d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); 767d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 76887997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p0q0 = abs_diff(q0p0, p0q0); 76987997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_p1q1 = abs_diff(q1p1, p1q1); 770d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 
771d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org hev = _mm_subs_epu8(flat, thresh); 772d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 773d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 774d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 775d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 776d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 777d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 778d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 779d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(abs_p1p0, mask); 780d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(p1 - p0) > limit) * -1; 781d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(q1 - q0) > limit) * -1; 782d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 78387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org work = _mm_max_epu8(abs_diff(q2p2, q1p1), 78487997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_diff(q3p3, q2p2)); 785d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(work, mask); 786d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); 787d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_subs_epu8(mask, limit); 788d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_cmpeq_epi8(mask, zero); 
789d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 790d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // flat_mask4 791d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 79287997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org flat = _mm_max_epu8(abs_diff(q2p2, q0p0), 79387997d490ae52aa962a985c95b3cddf7f8832641johannkoenig@chromium.org abs_diff(q3p3, q0p0)); 794d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_max_epu8(abs_p1p0, flat); 795d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); 796d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_subs_epu8(flat, one); 797d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_cmpeq_epi8(flat, zero); 798d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_and_si128(flat, mask); 799d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 800d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 801d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 802d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i four = _mm_set1_epi16(4); 803d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org unsigned char *src = s; 804d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 805d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i workp_a, workp_b, workp_shft; 806d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); 807d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); 808d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); 
809d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); 810d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); 811d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); 812d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); 813d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); 814d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 815d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); 816d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); 817d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); 818d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 819d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)&flat_op2[0], 820d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 821d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 822d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); 823d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 824d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)&flat_op1[0], 
825d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 826d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 827d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); 828d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); 829d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 830d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)&flat_op0[0], 831d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 832d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 833d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); 834d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); 835d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 836d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)&flat_oq0[0], 837d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 838d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 839d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); 840d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); 841d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 842d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 
_mm_storel_epi64((__m128i *)&flat_oq1[0], 843d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 844d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 845d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); 846d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); 847d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 848d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)&flat_oq2[0], 849d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_packus_epi16(workp_shft, workp_shft)); 850d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 851d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 852d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // lp filter 853d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 854d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t4 = _mm_set1_epi8(4); 855d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t3 = _mm_set1_epi8(3); 856d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t80 = _mm_set1_epi8(0x80); 857d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t1 = _mm_set1_epi8(0x1); 858d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), 859d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 860d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), 861d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 
862d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), 863d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 864d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), 865d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 866d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i filt; 867d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i work_a; 868d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i filter1, filter2; 869d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 870d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 871d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_subs_epi8(qs0, ps0); 872d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 873d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 874d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 875d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // (vp9_filter + 3 * (qs0 - ps0)) & mask 876d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_and_si128(filt, mask); 877d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 878d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_adds_epi8(filt, t4); 879d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_adds_epi8(filt, t3); 880d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 881d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter1 >> 3 
882d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_unpacklo_epi8(zero, filter1); 883d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_srai_epi16(filter1, 11); 884d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_packs_epi16(filter1, filter1); 885d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 886d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter2 >> 3 887d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_unpacklo_epi8(zero, filter2); 888d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_srai_epi16(filter2, 11); 889d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_packs_epi16(filter2, zero); 890d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 891d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filt >> 1 892d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filter1, t1); 893d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_unpacklo_epi8(zero, filt); 894d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_srai_epi16(filt, 9); 895d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_packs_epi16(filt, zero); 896d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 897d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_andnot_si128(hev, filt); 898d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 899d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 900d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_loadl_epi64((__m128i *)flat_oq0); 901d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 
902d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_and_si128(flat, q0); 903d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_or_si128(work_a, q0); 904d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 905d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 906d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_loadl_epi64((__m128i *)flat_oq1); 907d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 908d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_and_si128(flat, q1); 909d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_or_si128(work_a, q1); 910d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 911d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); 912d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2 = _mm_loadl_epi64((__m128i *)flat_oq2); 913d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 914d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2 = _mm_and_si128(flat, q2); 915d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2 = _mm_or_si128(work_a, q2); 916d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 917d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 918d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_loadl_epi64((__m128i *)flat_op0); 919d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 920d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_and_si128(flat, p0); 
921d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_or_si128(work_a, p0); 922d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 923d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 924d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_loadl_epi64((__m128i *)flat_op1); 925d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 926d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_and_si128(flat, p1); 927d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_or_si128(work_a, p1); 928d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 929d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); 930d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p2 = _mm_loadl_epi64((__m128i *)flat_op2); 931d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_andnot_si128(flat, work_a); 932d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p2 = _mm_and_si128(flat, p2); 933d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p2 = _mm_or_si128(work_a, p2); 934d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 935d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s - 3 * p), p2); 936d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s - 2 * p), p1); 937d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s - 1 * p), p0); 938d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s + 0 * p), q0); 939d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s + 1 * p), q1); 
940d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storel_epi64((__m128i *)(s + 2 * p), q2); 941d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 942d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org} 943d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org
// Filters a horizontal edge for two side-by-side 8-pixel segments at once,
// processing all 16 columns with each SSE2 instruction.
//
//   s         points at the first pixel of the row just below the edge (q0).
//             Rows s - 4 * p .. s + 3 * p are read (16 bytes each); rows
//             s - 3 * p .. s + 2 * p are rewritten (16 bytes each).
//   p         distance in bytes between vertically adjacent pixels.
//   _blimit0, _limit0, _thresh0   thresholds applied to columns 0..7.
//   _blimit1, _limit1, _thresh1   thresholds applied to columns 8..15.
//
// Every threshold pointer is read with an aligned 16-byte load, so each must
// be 16-byte aligned and reference 16 readable bytes; only the low 8 bytes of
// each load are kept.
void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
                                    const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  // Scratch rows holding the smoothed candidates for p2..q2. Bytes 0..7 are
  // filled for the first segment, bytes 8..15 for the second.
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
  const __m128i zero = _mm_set1_epi16(0);
  // Low 64 bits come from the "0" thresholds, high 64 bits from the "1" set.
  const __m128i blimit = _mm_unpacklo_epi64(
      _mm_load_si128((const __m128i *)_blimit0),
      _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit = _mm_unpacklo_epi64(
      _mm_load_si128((const __m128i *)_limit0),
      _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh = _mm_unpacklo_epi64(
      _mm_load_si128((const __m128i *)_thresh0),
      _mm_load_si128((const __m128i *)_thresh1));
  __m128i mask, hev, flat;

  // Stage 1: derive the per-byte hev, mask and flat selectors (0xff / 0x00).
  {
    const __m128i r_p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
    const __m128i r_p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    const __m128i r_p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
    const __m128i r_p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
    const __m128i r_q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
    const __m128i r_q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
    const __m128i r_q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    const __m128i r_q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    // |a - b| on unsigned bytes: one saturating difference is always zero,
    // so OR-ing both directions yields the absolute difference.
    const __m128i d_p1p0 = _mm_or_si128(_mm_subs_epu8(r_p1, r_p0),
                                        _mm_subs_epu8(r_p0, r_p1));
    const __m128i d_q1q0 = _mm_or_si128(_mm_subs_epu8(r_q1, r_q0),
                                        _mm_subs_epu8(r_q0, r_q1));
    const __m128i d_p0q0 = _mm_or_si128(_mm_subs_epu8(r_p0, r_q0),
                                        _mm_subs_epu8(r_q0, r_p0));
    const __m128i d_p1q1 = _mm_or_si128(_mm_subs_epu8(r_p1, r_q1),
                                        _mm_subs_epu8(r_q1, r_p1));
    const __m128i d_p2p1 = _mm_or_si128(_mm_subs_epu8(r_p2, r_p1),
                                        _mm_subs_epu8(r_p1, r_p2));
    const __m128i d_p3p2 = _mm_or_si128(_mm_subs_epu8(r_p3, r_p2),
                                        _mm_subs_epu8(r_p2, r_p3));
    const __m128i d_q2q1 = _mm_or_si128(_mm_subs_epu8(r_q2, r_q1),
                                        _mm_subs_epu8(r_q1, r_q2));
    const __m128i d_q3q2 = _mm_or_si128(_mm_subs_epu8(r_q3, r_q2),
                                        _mm_subs_epu8(r_q2, r_q3));
    const __m128i d_p2p0 = _mm_or_si128(_mm_subs_epu8(r_p2, r_p0),
                                        _mm_subs_epu8(r_p0, r_p2));
    const __m128i d_q2q0 = _mm_or_si128(_mm_subs_epu8(r_q2, r_q0),
                                        _mm_subs_epu8(r_q0, r_q2));
    const __m128i d_p3p0 = _mm_or_si128(_mm_subs_epu8(r_p3, r_p0),
                                        _mm_subs_epu8(r_p0, r_p3));
    const __m128i d_q3q0 = _mm_or_si128(_mm_subs_epu8(r_q3, r_q0),
                                        _mm_subs_epu8(r_q0, r_q3));
    const __m128i ff = _mm_cmpeq_epi8(d_p1p0, d_p1p0);
    // max(|p1 - p0|, |q1 - q0|) feeds hev, mask and flat alike.
    const __m128i inner = _mm_max_epu8(d_p1p0, d_q1q0);
    // 2 * |p0 - q0| + |p1 - q1| / 2 with unsigned saturation. Bit 0 of every
    // byte is cleared first so the 16-bit shift cannot carry a bit from the
    // upper byte of a lane into the lower one.
    const __m128i edge_sum =
        _mm_adds_epu8(_mm_adds_epu8(d_p0q0, d_p0q0),
                      _mm_srli_epi16(_mm_and_si128(d_p1q1, fe), 1));
    // 0xff where the edge sum exceeds blimit, 0x00 elsewhere.
    const __m128i over_blimit = _mm_xor_si128(
        _mm_cmpeq_epi8(_mm_subs_epu8(edge_sum, blimit), zero), ff);
    __m128i peak;

    // hev: 0xff where max(|p1 - p0|, |q1 - q0|) > thresh.
    hev = _mm_xor_si128(_mm_cmpeq_epi8(_mm_subs_epu8(inner, thresh), zero),
                        ff);

    // mask: 0xff where every neighbour difference is <= limit and the
    // blimit test above did not trip.
    peak = _mm_max_epu8(inner, over_blimit);
    peak = _mm_max_epu8(_mm_max_epu8(d_p2p1, d_p3p2), peak);
    peak = _mm_max_epu8(_mm_max_epu8(d_q2q1, d_q3q2), peak);
    mask = _mm_cmpeq_epi8(_mm_subs_epu8(peak, limit), zero);

    // flat: 0xff where p1..p3 and q1..q3 are all within 1 of p0 / q0 and the
    // pixel also passes mask.
    peak = _mm_max_epu8(_mm_max_epu8(d_p2p0, d_q2q0), inner);
    peak = _mm_max_epu8(_mm_max_epu8(d_p3p0, d_q3q0), peak);
    flat = _mm_and_si128(_mm_cmpeq_epi8(_mm_subs_epu8(peak, one), zero),
                         mask);
  }

  // Stage 2: compute the smoothed candidates in 16-bit precision, eight
  // columns per pass, and park them in the scratch rows.
  {
    const __m128i four = _mm_set1_epi16(4);
    int seg;

    for (seg = 0; seg < 2; ++seg) {
      unsigned char *const col = s + 8 * seg;
      const int off = 8 * seg;
      // Widen eight pixels of each row from bytes to 16-bit lanes.
      const __m128i w_p3 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 4 * p)), zero);
      const __m128i w_p2 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 3 * p)), zero);
      const __m128i w_p1 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 2 * p)), zero);
      const __m128i w_p0 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 1 * p)), zero);
      const __m128i w_q0 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col - 0 * p)), zero);
      const __m128i w_q1 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 1 * p)), zero);
      const __m128i w_q2 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 2 * p)), zero);
      const __m128i w_q3 =
          _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(col + 3 * p)), zero);
      __m128i sum_a, sum_b, tap;

      // The two partial sums are updated incrementally from one output to the
      // next instead of being rebuilt for every tap.

      // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
      sum_a = _mm_add_epi16(_mm_add_epi16(w_p3, w_p3),
                            _mm_add_epi16(w_p2, w_p1));
      sum_a = _mm_add_epi16(_mm_add_epi16(sum_a, four), w_p0);
      sum_b = _mm_add_epi16(_mm_add_epi16(w_q0, w_p2), w_p3);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_op2 + off),
                       _mm_packus_epi16(tap, tap));

      // op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
      sum_b = _mm_add_epi16(_mm_add_epi16(w_q0, w_q1), w_p1);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_op1 + off),
                       _mm_packus_epi16(tap, tap));

      // op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, w_p3), w_q2);
      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, w_p1), w_p0);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_op0 + off),
                       _mm_packus_epi16(tap, tap));

      // oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, w_p3), w_q3);
      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, w_p0), w_q0);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_oq0 + off),
                       _mm_packus_epi16(tap, tap));

      // oq1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, w_p2), w_q3);
      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, w_q0), w_q1);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_oq1 + off),
                       _mm_packus_epi16(tap, tap));

      // oq2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3
      sum_a = _mm_add_epi16(_mm_sub_epi16(sum_a, w_p1), w_q3);
      sum_b = _mm_add_epi16(_mm_sub_epi16(sum_b, w_q1), w_q2);
      tap = _mm_srli_epi16(_mm_add_epi16(sum_a, sum_b), 3);
      _mm_storel_epi64((__m128i *)(flat_oq2 + off),
                       _mm_packus_epi16(tap, tap));
    }
  }

  // Stage 3: run the narrow filter on p1..q1, then choose per byte between
  // its result and the smoothed candidate according to flat.
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    // XOR with 0x80 re-biases unsigned pixels into signed bytes.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    // p2 / q2 are untouched by the narrow filter; keep their current values
    // for the bytes where flat is clear.
    const __m128i raw_p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    const __m128i raw_q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    const __m128i delta = _mm_subs_epi8(qs0, ps0);
    __m128i filt, sign, filter1, filter2;
    __m128i out_p2, out_p1, out_p0, out_q0, out_q1, out_q2;

    // filt = ((ps1 - qs1) & hev) + 3 * (qs0 - ps0), saturating at each step,
    // then gated by mask.
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    filt = _mm_adds_epi8(filt, delta);
    filt = _mm_adds_epi8(filt, delta);
    filt = _mm_adds_epi8(filt, delta);
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Signed per-byte (filter1 >> 3): shift the 16-bit lanes logically, mask
    // off the bits that crossed in from the neighbouring byte, then refill
    // the top three bits wherever the byte was negative.
    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter1), te0);
    filter1 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(filter1, 3), t1f),
                           sign);

    // Signed per-byte (filter2 >> 3), same technique.
    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filter2), te0);
    filter2 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(filter2, 3), t1f),
                           sign);

    // Signed per-byte ((filter1 + 1) >> 1); applied to p1 / q1 only where
    // hev is clear.
    filt = _mm_adds_epi8(filter1, t1);
    sign = _mm_and_si128(_mm_cmpgt_epi8(zero, filt), t80);
    filt = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(filt, 1), t7f), sign);
    filt = _mm_andnot_si128(hev, filt);

    // Per byte: take the smoothed candidate where flat is set, otherwise the
    // narrow-filter result (or the unchanged pixel for p2 / q2).
    out_q0 = _mm_or_si128(
        _mm_andnot_si128(flat,
                         _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80)),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_oq0)));
    out_q1 = _mm_or_si128(
        _mm_andnot_si128(flat, _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80)),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_oq1)));
    out_q2 = _mm_or_si128(
        _mm_andnot_si128(flat, raw_q2),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_oq2)));
    out_p0 = _mm_or_si128(
        _mm_andnot_si128(flat,
                         _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80)),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_op0)));
    out_p1 = _mm_or_si128(
        _mm_andnot_si128(flat, _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80)),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_op1)));
    out_p2 = _mm_or_si128(
        _mm_andnot_si128(flat, raw_p2),
        _mm_and_si128(flat, _mm_load_si128((__m128i *)flat_op2)));

    // All six rows are written back unconditionally, 16 bytes each.
    _mm_storeu_si128((__m128i *)(s - 3 * p), out_p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), out_p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), out_p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), out_q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), out_q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), out_q2);
  }
}
1190d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 11918b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.orgvoid vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, 11928b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_blimit0, 11938b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_limit0, 11948b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_thresh0, 11958b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_blimit1, 11968b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_limit1, 11978b26fe55f3e4daa2311dbd2d95e8ac2b4e080685johannkoenig@chromium.org const unsigned char *_thresh1) { 1198d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i blimit = 1199d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), 1200d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_load_si128((const __m128i *)_blimit1)); 1201d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i limit = 1202d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), 1203d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_load_si128((const __m128i *)_limit1)); 1204d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i thresh = 1205d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), 
1206d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_load_si128((const __m128i *)_thresh1)); 1207d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i zero = _mm_set1_epi16(0); 1208d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i p3, p2, p1, p0, q0, q1, q2, q3; 1209d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i mask, hev, flat; 1210d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1211d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); 1212d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); 1213d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); 1214d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); 1215d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); 1216d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); 1217d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); 1218d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); 1219d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1220d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filter_mask and hev_mask 1221d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 1222d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), 1223d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(p0, p1)); 1224d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i abs_q1q0 = 
_mm_or_si128(_mm_subs_epu8(q1, q0), 1225d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(q0, q1)); 1226d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i fe = _mm_set1_epi8(0xfe); 1227d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 1228d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), 1229d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(q0, p0)); 1230d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), 1231d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(q1, p1)); 1232d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i work; 1233d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1234d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 1235d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org hev = _mm_subs_epu8(flat, thresh); 1236d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 1237d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1238d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 1239d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 1240d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 1241d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 1242d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * 
-1; 1243d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(flat, mask); 1244d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(p1 - p0) > limit) * -1; 1245d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // mask |= (abs(q1 - q0) > limit) * -1; 1246d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), 1247d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(p1, p2)), 1248d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_or_si128(_mm_subs_epu8(p3, p2), 1249d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(p2, p3))); 1250d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(work, mask); 1251d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), 1252d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(q1, q2)), 1253d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_or_si128(_mm_subs_epu8(q3, q2), 1254d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_subs_epu8(q2, q3))); 1255d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_max_epu8(work, mask); 1256d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_subs_epu8(mask, limit); 1257d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org mask = _mm_cmpeq_epi8(mask, zero); 1258d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org } 1259d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1260d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filter4 1261d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org { 1262d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t4 = _mm_set1_epi8(4); 
1263d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t3 = _mm_set1_epi8(3); 1264d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t80 = _mm_set1_epi8(0x80); 1265d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i te0 = _mm_set1_epi8(0xe0); 1266d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t1f = _mm_set1_epi8(0x1f); 1267d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t1 = _mm_set1_epi8(0x1); 1268d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i t7f = _mm_set1_epi8(0x7f); 1269d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1270d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), 1271d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 1272d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), 1273d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 1274d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), 1275d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 1276d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), 1277d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org t80); 1278d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i filt; 1279d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i work_a; 1280d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org __m128i filter1, filter2; 1281d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 
1282d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 1283d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_subs_epi8(qs0, ps0); 1284d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 1285d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 1286d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filt, work_a); 1287d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // (vp9_filter + 3 * (qs0 - ps0)) & mask 1288d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_and_si128(filt, mask); 1289d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1290d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_adds_epi8(filt, t4); 1291d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_adds_epi8(filt, t3); 1292d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1293d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter1 >> 3 1294d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, filter1); 1295d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_srli_epi16(filter1, 3); 1296d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_and_si128(work_a, te0); 1297d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_and_si128(filter1, t1f); 1298d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter1 = _mm_or_si128(filter1, work_a); 1299d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1300d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // Filter2 >> 3 1301d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, 
filter2); 1302d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_srli_epi16(filter2, 3); 1303d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_and_si128(work_a, te0); 1304d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_and_si128(filter2, t1f); 1305d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filter2 = _mm_or_si128(filter2, work_a); 1306d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1307d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org // filt >> 1 1308d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_adds_epi8(filter1, t1); 1309d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_cmpgt_epi8(zero, filt); 1310d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_srli_epi16(filt, 1); 1311d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org work_a = _mm_and_si128(work_a, t80); 1312d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_and_si128(filt, t7f); 1313d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_or_si128(filt, work_a); 1314d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1315d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org filt = _mm_andnot_si128(hev, filt); 1316d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1317d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 1318d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 1319d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 1320d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 
1321d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org 1322d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 2 * p), p1); 1323d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s - 1 * p), p0); 1324d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 0 * p), q0); 1325d851b91d14ef0bd71acdce7b90c9a8f1af1181adjohannkoenig@chromium.org _mm_storeu_si128((__m128i *)(s + 1 * p), q1); 13266fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org } 13276fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org} 13286fefe538d859300e7febe78271828198c10f1b52fgalligan@chromium.org

// Transposes two 8x8 byte tiles into one block of 8 rows x 16 bytes.
// Eight bytes are read from each of the 8 rows starting at in0 and at in1
// (input rows are in_p bytes apart). Output row j (rows are out_p bytes
// apart, written with unaligned 16-byte stores) holds byte j of in0's rows
// 0..7 followed by byte j of in1's rows 0..7. Every load is issued before
// the first store.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  // Rows 0..7 of the first tile, 8 bytes each in the low half.
  const __m128i a0 = _mm_loadl_epi64((__m128i *)(in0 + 0 * in_p));
  const __m128i a1 = _mm_loadl_epi64((__m128i *)(in0 + 1 * in_p));
  const __m128i a2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  const __m128i a3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  const __m128i a4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  const __m128i a5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  const __m128i a6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  const __m128i a7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));

  // Rows 0..7 of the second tile.
  const __m128i b0 = _mm_loadl_epi64((__m128i *)(in1 + 0 * in_p));
  const __m128i b1 = _mm_loadl_epi64((__m128i *)(in1 + 1 * in_p));
  const __m128i b2 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  const __m128i b3 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  const __m128i b4 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  const __m128i b5 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  const __m128i b6 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  const __m128i b7 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));

  // Stage 1: interleave the bytes of neighbouring rows.
  const __m128i a01 = _mm_unpacklo_epi8(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi8(a2, a3);
  const __m128i a45 = _mm_unpacklo_epi8(a4, a5);
  const __m128i a67 = _mm_unpacklo_epi8(a6, a7);
  const __m128i b01 = _mm_unpacklo_epi8(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi8(b2, b3);
  const __m128i b45 = _mm_unpacklo_epi8(b4, b5);
  const __m128i b67 = _mm_unpacklo_epi8(b6, b7);

  // Stage 2: interleave 16-bit pairs. The "lo" results carry input columns
  // 0..3, the "hi" results carry input columns 4..7.
  const __m128i a0123_lo = _mm_unpacklo_epi16(a01, a23);
  const __m128i a4567_lo = _mm_unpacklo_epi16(a45, a67);
  const __m128i b0123_lo = _mm_unpacklo_epi16(b01, b23);
  const __m128i b4567_lo = _mm_unpacklo_epi16(b45, b67);
  const __m128i a0123_hi = _mm_unpackhi_epi16(a01, a23);
  const __m128i a4567_hi = _mm_unpackhi_epi16(a45, a67);
  const __m128i b0123_hi = _mm_unpackhi_epi16(b01, b23);
  const __m128i b4567_hi = _mm_unpackhi_epi16(b45, b67);

  // Stage 3: interleave 32-bit groups. Each 64-bit half of a result is now
  // one complete input column (8 bytes) of the corresponding tile.
  const __m128i a_col01 = _mm_unpacklo_epi32(a0123_lo, a4567_lo);
  const __m128i a_col23 = _mm_unpackhi_epi32(a0123_lo, a4567_lo);
  const __m128i b_col01 = _mm_unpacklo_epi32(b0123_lo, b4567_lo);
  const __m128i b_col23 = _mm_unpackhi_epi32(b0123_lo, b4567_lo);
  const __m128i a_col45 = _mm_unpacklo_epi32(a0123_hi, a4567_hi);
  const __m128i a_col67 = _mm_unpackhi_epi32(a0123_hi, a4567_hi);
  const __m128i b_col45 = _mm_unpacklo_epi32(b0123_hi, b4567_hi);
  const __m128i b_col67 = _mm_unpackhi_epi32(b0123_hi, b4567_hi);

  // Stage 4: pair column j of the first tile with column j of the second
  // tile and write it out as output row j.
  _mm_storeu_si128((__m128i *)(out + 0 * out_p),
                   _mm_unpacklo_epi64(a_col01, b_col01));
  _mm_storeu_si128((__m128i *)(out + 1 * out_p),
                   _mm_unpackhi_epi64(a_col01, b_col01));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p),
                   _mm_unpacklo_epi64(a_col23, b_col23));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p),
                   _mm_unpackhi_epi64(a_col23, b_col23));
  _mm_storeu_si128((__m128i *)(out + 4 * out_p),
                   _mm_unpacklo_epi64(a_col45, b_col45));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p),
                   _mm_unpackhi_epi64(a_col45, b_col45));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p),
                   _mm_unpacklo_epi64(a_col67, b_col67));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p),
                   _mm_unpackhi_epi64(a_col67, b_col67));
}

// Transposes a list of independent 8x8 byte tiles. Tile i is read from
// src[i] (rows in_p bytes apart) and its transpose is written to dst[i]
// (rows out_p bytes apart, 8 bytes per row). The loop body runs before the
// count is tested, so at least one tile is always processed.
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int tile = 0;
  do {
    unsigned char *const in = src[tile];
    unsigned char *const out = dst[tile];

    // The eight input rows; all are loaded before anything is stored.
    const __m128i r0 = _mm_loadl_epi64((__m128i *)(in + 0 * in_p));
    const __m128i r1 = _mm_loadl_epi64((__m128i *)(in + 1 * in_p));
    const __m128i r2 = _mm_loadl_epi64((__m128i *)(in + 2 * in_p));
    const __m128i r3 = _mm_loadl_epi64((__m128i *)(in + 3 * in_p));
    const __m128i r4 = _mm_loadl_epi64((__m128i *)(in + 4 * in_p));
    const __m128i r5 = _mm_loadl_epi64((__m128i *)(in + 5 * in_p));
    const __m128i r6 = _mm_loadl_epi64((__m128i *)(in + 6 * in_p));
    const __m128i r7 = _mm_loadl_epi64((__m128i *)(in + 7 * in_p));

    // rXY holds the bytes of rows X and Y interleaved: X0 Y0 X1 Y1 ... X7 Y7.
    const __m128i r01 = _mm_unpacklo_epi8(r0, r1);
    const __m128i r23 = _mm_unpacklo_epi8(r2, r3);
    const __m128i r45 = _mm_unpacklo_epi8(r4, r5);
    const __m128i r67 = _mm_unpacklo_epi8(r6, r7);

    // lo* gather input columns 0..3, hi* gather input columns 4..7, each for
    // the four rows named in the suffix.
    const __m128i lo0123 = _mm_unpacklo_epi16(r01, r23);
    const __m128i lo4567 = _mm_unpacklo_epi16(r45, r67);
    const __m128i hi0123 = _mm_unpackhi_epi16(r01, r23);
    const __m128i hi4567 = _mm_unpackhi_epi16(r45, r67);

    // colJK: low 64 bits are input column J, high 64 bits are input column K.
    const __m128d col01 = _mm_castsi128_pd(_mm_unpacklo_epi32(lo0123, lo4567));
    const __m128d col23 = _mm_castsi128_pd(_mm_unpackhi_epi32(lo0123, lo4567));
    const __m128d col45 = _mm_castsi128_pd(_mm_unpacklo_epi32(hi0123, hi4567));
    const __m128d col67 = _mm_castsi128_pd(_mm_unpackhi_epi32(hi0123, hi4567));

    // Input column j becomes output row j.
    _mm_storel_pd((double *)(out + 0 * out_p), col01);
    _mm_storeh_pd((double *)(out + 1 * out_p), col01);
    _mm_storel_pd((double *)(out + 2 * out_p), col23);
    _mm_storeh_pd((double *)(out + 3 * out_p), col23);
    _mm_storel_pd((double *)(out + 4 * out_p), col45);
    _mm_storeh_pd((double *)(out + 5 * out_p), col45);
    _mm_storel_pd((double *)(out + 6 * out_p), col67);
    _mm_storeh_pd((double *)(out + 7 * out_p), col67);
  } while (++tile < num_8x8_to_transpose);
}

// Vertical-edge wrapper around vp9_lpf_horizontal_4_dual_sse2(). The 8
// columns s - 4 .. s + 3 of 16 rows (rows 0..7 and rows 8..15, p bytes
// apart) are transposed into a scratch buffer, the horizontal filter is run
// on that buffer, and the buffer is transposed back over the same pixels.
void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  uint8_t *const rows_0_7 = s - 4;
  uint8_t *const rows_8_15 = s - 4 + p * 8;
  unsigned char *src[2];
  unsigned char *dst[2];

  // Frame -> scratch: 8 scratch rows of 16 bytes, one per frame column.
  transpose8x16(rows_0_7, rows_8_15, p, t_dst, 16);

  // Scratch row 4 holds frame column s, so the filter is pointed there.
  vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);

  // Scratch -> frame, handled as two 8x8 tiles.
  src[0] = t_dst;
  dst[0] = rows_0_7;
  src[1] = t_dst + 8;
  dst[1] = rows_8_15;
  transpose(src, 16, dst, p, 2);
}

// Vertical-edge wrapper around vp9_lpf_horizontal_8_sse2(). The 8x8 tile
// starting at s - 4 is transposed into a scratch buffer, the horizontal
// filter is called on it with a count of 1, and the tile is transposed back.
// The count argument of this function is ignored.
void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh, int count) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
  unsigned char *frame_tile[1];
  unsigned char *scratch_tile[1];
  (void)count;

  frame_tile[0] = s - 4;
  scratch_tile[0] = t_dst;

  // Frame -> scratch (scratch rows are 8 bytes apart).
  transpose(frame_tile, p, scratch_tile, 8, 1);

  // Scratch row 4 holds frame column s.
  vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1);

  // Scratch -> frame.
  transpose(scratch_tile, 8, frame_tile, p, 1);
}

// Vertical-edge wrapper around vp9_lpf_horizontal_8_dual_sse2(). Same data
// movement as vp9_lpf_vertical_4_dual_sse2(): columns s - 4 .. s + 3 of 16
// rows go through a scratch buffer in transposed form.
void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
  uint8_t *const rows_0_7 = s - 4;
  uint8_t *const rows_8_15 = s - 4 + p * 8;
  unsigned char *src[2];
  unsigned char *dst[2];

  // Frame -> scratch: 8 scratch rows of 16 bytes, one per frame column.
  transpose8x16(rows_0_7, rows_8_15, p, t_dst, 16);

  // Scratch row 4 holds frame column s, so the filter is pointed there.
  vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);

  // Scratch -> frame, handled as two 8x8 tiles.
  src[0] = t_dst;
  dst[0] = rows_0_7;
  src[1] = t_dst + 8;
  dst[1] = rows_8_15;
  transpose(src, 16, dst, p, 2);
}

// Vertical-edge wrapper around mb_lpf_horizontal_edge_w_sse2_8(). The 16
// columns s - 8 .. s + 7 of 8 rows are transposed, as two 8x8 tiles, into a
// scratch buffer of 16 rows x 8 bytes; the horizontal filter is run on the
// scratch buffer and both tiles are transposed back.
void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
  unsigned char *frame_tiles[2];
  unsigned char *scratch_tiles[2];

  // Tile 0 is the 8 columns left of s, tile 1 the 8 columns starting at s.
  frame_tiles[0] = s - 8;
  frame_tiles[1] = s;
  scratch_tiles[0] = t_dst;
  scratch_tiles[1] = t_dst + 8 * 8;

  // Frame -> scratch (scratch rows are 8 bytes apart).
  transpose(frame_tiles, p, scratch_tiles, 8, 2);

  // Scratch row 8 holds frame column s.
  mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);

  // Scratch -> frame.
  transpose(scratch_tiles, 8, frame_tiles, p, 2);
}

// Vertical-edge wrapper around mb_lpf_horizontal_edge_w_sse2_16(). The 16x16
// region whose top-left corner is s - 8 is transposed into a scratch buffer
// with 16-byte rows, filtered there, and transposed back.
void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *const left = s - 8;
  unsigned char *const t_edge = t_dst + 8 * 16;

  // Frame -> scratch: columns s - 8 .. s - 1 land in scratch rows 0..7 and
  // columns s .. s + 7 in scratch rows 8..15.
  transpose8x16(left, left + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_edge, 16);

  // Scratch row 8 holds frame column s.
  mb_lpf_horizontal_edge_w_sse2_16(t_edge, 16, blimit, limit, thresh);

  // Scratch -> frame: scratch columns 0..7 become frame rows 0..7, scratch
  // columns 8..15 become frame rows 8..15.
  transpose8x16(t_dst, t_edge, 16, left, p);
  transpose8x16(t_dst + 8, t_edge + 8, 16, left + 8 * p, p);
}