1233d2500723e5594f3e7c70896ffeeef32b9c950ywan/* 2233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3233d2500723e5594f3e7c70896ffeeef32b9c950ywan * 4233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Use of this source code is governed by a BSD-style license 5233d2500723e5594f3e7c70896ffeeef32b9c950ywan * that can be found in the LICENSE file in the root of the source 6233d2500723e5594f3e7c70896ffeeef32b9c950ywan * tree. An additional intellectual property rights grant can be found 7233d2500723e5594f3e7c70896ffeeef32b9c950ywan * in the file PATENTS. All contributing project authors may 8233d2500723e5594f3e7c70896ffeeef32b9c950ywan * be found in the AUTHORS file in the root of the source tree. 9233d2500723e5594f3e7c70896ffeeef32b9c950ywan */ 10233d2500723e5594f3e7c70896ffeeef32b9c950ywan 11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <emmintrin.h> // SSE2 12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_loopfilter.h" 13233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/emmintrin_compat.h" 14233d2500723e5594f3e7c70896ffeeef32b9c950ywan 15233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, 16233d2500723e5594f3e7c70896ffeeef32b9c950ywan int p, 17233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit, 18233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit, 19233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh) { 20233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i zero = _mm_set1_epi16(0); 21233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i one = _mm_set1_epi8(1); 22233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 23233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i limit = _mm_load_si128((const __m128i *)_limit); 24233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 25233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i mask, hev, flat, flat2; 26233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; 27233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1p0; 28233d2500723e5594f3e7c70896ffeeef32b9c950ywan 29233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); 30233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), 31233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 4 * p))); 32233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); 33233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3p3 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q3p3), 34233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 3 * p))); 35233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); 36233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q2p2), 37233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 2 * p))); 38233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); 39233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q1p1), 40233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 1 * p))); 41233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1q1 = _mm_shuffle_epi32(q1p1, 78); 42233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); 43233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q0p0), 44233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s - 0 * p))); 45233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0q0 = _mm_shuffle_epi32(q0p0, 78); 46233d2500723e5594f3e7c70896ffeeef32b9c950ywan 47233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 48233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; 49233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), 50233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q1p1)); 51233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); 52233d2500723e5594f3e7c70896ffeeef32b9c950ywan fe = _mm_set1_epi8(0xfe); 53233d2500723e5594f3e7c70896ffeeef32b9c950ywan ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 54233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), 55233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0q0, q0p0)); 56233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), 57233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p1q1, q1p1)); 58233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 59233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_subs_epu8(flat, thresh); 60233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 61233d2500723e5594f3e7c70896ffeeef32b9c950ywan 62233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 63233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 64233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 65233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 66233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 67233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(abs_p1p0, mask); 68233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p1 - p0) > limit) * -1; 69233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(q1 - q0) > limit) * -1; 70233d2500723e5594f3e7c70896ffeeef32b9c950ywan 71233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), 72233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1p1, q2p2)), 73233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), 74233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q2p2, q3p3))); 75233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 76233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); 77233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(mask, limit); 78233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_cmpeq_epi8(mask, zero); 79233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 80233d2500723e5594f3e7c70896ffeeef32b9c950ywan 81233d2500723e5594f3e7c70896ffeeef32b9c950ywan // lp filter 82233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 83233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t4 = _mm_set1_epi8(4); 84233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t3 = _mm_set1_epi8(3); 85233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t80 = _mm_set1_epi8(0x80); 86233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1 = _mm_set1_epi16(0x1); 87233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); 88233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); 89233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs0 = _mm_xor_si128(p0q0, t80); 90233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs1 = _mm_xor_si128(p1q1, t80); 91233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filt; 92233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work_a; 93233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filter1, filter2; 94233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; 95233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; 96233d2500723e5594f3e7c70896ffeeef32b9c950ywan 97233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); 98233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_subs_epi8(qs0, qs0ps0); 99233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 100233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 101233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 102233d2500723e5594f3e7c70896ffeeef32b9c950ywan // (vp9_filter + 3 * (qs0 - ps0)) & mask 103233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, mask); 104233d2500723e5594f3e7c70896ffeeef32b9c950ywan 105233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_adds_epi8(filt, t4); 106233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_adds_epi8(filt, t3); 107233d2500723e5594f3e7c70896ffeeef32b9c950ywan 108233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_unpacklo_epi8(zero, filter1); 109233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_srai_epi16(filter1, 0xB); 110233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_unpacklo_epi8(zero, filter2); 111233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_srai_epi16(filter2, 0xB); 112233d2500723e5594f3e7c70896ffeeef32b9c950ywan 113233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter1 >> 3 114233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); 115233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); 116233d2500723e5594f3e7c70896ffeeef32b9c950ywan 117233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filt >> 1 118233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi16(filter1, t1); 119233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_srai_epi16(filt, 1); 120233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), 121233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt); 122233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); 123233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); 124233d2500723e5594f3e7c70896ffeeef32b9c950ywan // loopfilter done 125233d2500723e5594f3e7c70896ffeeef32b9c950ywan 126233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 127233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work; 128233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), 129233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q2p2)), 130233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), 131233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q3p3))); 132233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, flat); 133233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); 134233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_subs_epu8(flat, one); 135233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_cmpeq_epi8(flat, zero); 136233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_and_si128(flat, mask); 137233d2500723e5594f3e7c70896ffeeef32b9c950ywan 138233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); 139233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5p5 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q5p5), 140233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 5 * p))); 141233d2500723e5594f3e7c70896ffeeef32b9c950ywan 142233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); 143233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6p6 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q6p6), 144233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 6 * p))); 145233d2500723e5594f3e7c70896ffeeef32b9c950ywan 146233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q4p4, q0p0), 147233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q4p4)), 148233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), 149233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q5p5))); 150233d2500723e5594f3e7c70896ffeeef32b9c950ywan 151233d2500723e5594f3e7c70896ffeeef32b9c950ywan q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); 152233d2500723e5594f3e7c70896ffeeef32b9c950ywan q7p7 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q7p7), 153233d2500723e5594f3e7c70896ffeeef32b9c950ywan (__m64 *)(s + 7 * p))); 154233d2500723e5594f3e7c70896ffeeef32b9c950ywan 155233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q6p6, q0p0), 156233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q6p6)), 157233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), 158233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q7p7))); 159233d2500723e5594f3e7c70896ffeeef32b9c950ywan 160233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(work, flat2); 161233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); 162233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_subs_epu8(flat2, one); 163233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_cmpeq_epi8(flat2, zero); 164233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask 165233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 166233d2500723e5594f3e7c70896ffeeef32b9c950ywan 167233d2500723e5594f3e7c70896ffeeef32b9c950ywan // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 168233d2500723e5594f3e7c70896ffeeef32b9c950ywan // flat and wide flat calculations 169233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 170233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i eight = _mm_set1_epi16(8); 171233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i four = _mm_set1_epi16(4); 172233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; 173233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; 174233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i pixelFilter_p, pixelFilter_q; 175233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; 176233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; 177233d2500723e5594f3e7c70896ffeeef32b9c950ywan 178233d2500723e5594f3e7c70896ffeeef32b9c950ywan p7_16 = _mm_unpacklo_epi8(q7p7, zero);; 179233d2500723e5594f3e7c70896ffeeef32b9c950ywan p6_16 = _mm_unpacklo_epi8(q6p6, zero); 180233d2500723e5594f3e7c70896ffeeef32b9c950ywan p5_16 = _mm_unpacklo_epi8(q5p5, zero); 181233d2500723e5594f3e7c70896ffeeef32b9c950ywan p4_16 = _mm_unpacklo_epi8(q4p4, zero); 182233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3_16 = _mm_unpacklo_epi8(q3p3, zero); 183233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2_16 = _mm_unpacklo_epi8(q2p2, zero); 184233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1_16 = _mm_unpacklo_epi8(q1p1, zero); 185233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0_16 = _mm_unpacklo_epi8(q0p0, zero); 186233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0_16 = _mm_unpackhi_epi8(q0p0, zero); 187233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1_16 = _mm_unpackhi_epi8(q1p1, zero); 188233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2_16 = _mm_unpackhi_epi8(q2p2, zero); 189233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3_16 = _mm_unpackhi_epi8(q3p3, zero); 190233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4_16 = _mm_unpackhi_epi8(q4p4, zero); 191233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5_16 = _mm_unpackhi_epi8(q5p5, zero); 192233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6_16 = _mm_unpackhi_epi8(q6p6, zero); 193233d2500723e5594f3e7c70896ffeeef32b9c950ywan q7_16 = _mm_unpackhi_epi8(q7p7, zero); 194233d2500723e5594f3e7c70896ffeeef32b9c950ywan 195233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), 196233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(p4_16, p3_16)); 197233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), 198233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(q4_16, q3_16)); 199233d2500723e5594f3e7c70896ffeeef32b9c950ywan 200233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); 201233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); 202233d2500723e5594f3e7c70896ffeeef32b9c950ywan 203233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); 204233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); 205233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, 206233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q)); 207233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_p2p1p0 = _mm_add_epi16(four, 208233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(pixetFilter_p2p1p0, 209233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_q2q1q0)); 210233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 211233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(p7_16, p0_16)), 4); 212233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 213233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(q7_16, q0_16)), 4); 214233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q0p0 = _mm_packus_epi16(res_p, res_q); 215233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 216233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(p3_16, p0_16)), 3); 217233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 218233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(q3_16, q0_16)), 3); 219233d2500723e5594f3e7c70896ffeeef32b9c950ywan 220233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q0p0 = _mm_packus_epi16(res_p, res_q); 221233d2500723e5594f3e7c70896ffeeef32b9c950ywan 222233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(p7_16, p7_16); 223233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(q7_16, q7_16); 224233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p3 = _mm_add_epi16(p3_16, p3_16); 225233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q3 = _mm_add_epi16(q3_16, q3_16); 226233d2500723e5594f3e7c70896ffeeef32b9c950ywan 227233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); 228233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); 229233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 230233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p1_16)), 4); 231233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 232233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q1_16)), 4); 233233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q1p1 = _mm_packus_epi16(res_p, res_q); 234233d2500723e5594f3e7c70896ffeeef32b9c950ywan 235233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); 236233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); 237233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 238233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p3, p1_16)), 3); 239233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, 240233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q3, q1_16)), 3); 241233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q1p1 = _mm_packus_epi16(res_p, res_q); 242233d2500723e5594f3e7c70896ffeeef32b9c950ywan 243233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(sum_p7, p7_16); 244233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(sum_q7, q7_16); 245233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p3 = _mm_add_epi16(sum_p3, p3_16); 246233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q3 = _mm_add_epi16(sum_q3, q3_16); 247233d2500723e5594f3e7c70896ffeeef32b9c950ywan 248233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); 249233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); 250233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 251233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p2_16)), 4); 252233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 253233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q2_16)), 4); 254233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q2p2 = _mm_packus_epi16(res_p, res_q); 255233d2500723e5594f3e7c70896ffeeef32b9c950ywan 256233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); 257233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); 258233d2500723e5594f3e7c70896ffeeef32b9c950ywan 259233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixetFilter_p2p1p0, 260233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p3, p2_16)), 3); 261233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixetFilter_q2q1q0, 262233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q3, q2_16)), 3); 263233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q2p2 = _mm_packus_epi16(res_p, res_q); 264233d2500723e5594f3e7c70896ffeeef32b9c950ywan 265233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(sum_p7, p7_16); 266233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(sum_q7, q7_16); 267233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); 268233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); 269233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 270233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p3_16)), 4); 271233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 272233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q3_16)), 4); 273233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q3p3 = _mm_packus_epi16(res_p, res_q); 274233d2500723e5594f3e7c70896ffeeef32b9c950ywan 275233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(sum_p7, p7_16); 276233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(sum_q7, q7_16); 277233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); 278233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); 279233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 280233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p4_16)), 4); 281233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 282233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q4_16)), 4); 283233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q4p4 = _mm_packus_epi16(res_p, res_q); 284233d2500723e5594f3e7c70896ffeeef32b9c950ywan 285233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(sum_p7, p7_16); 286233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(sum_q7, q7_16); 287233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); 288233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); 289233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 290233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p5_16)), 4); 291233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 292233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q5_16)), 4); 293233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q5p5 = _mm_packus_epi16(res_p, res_q); 294233d2500723e5594f3e7c70896ffeeef32b9c950ywan 295233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_p7 = _mm_add_epi16(sum_p7, p7_16); 296233d2500723e5594f3e7c70896ffeeef32b9c950ywan sum_q7 = _mm_add_epi16(sum_q7, q7_16); 297233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); 298233d2500723e5594f3e7c70896ffeeef32b9c950ywan pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); 299233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_p = _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, 300233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_p7, p6_16)), 4); 301233d2500723e5594f3e7c70896ffeeef32b9c950ywan res_q = _mm_srli_epi16(_mm_add_epi16(pixelFilter_q, 302233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_add_epi16(sum_q7, q6_16)), 4); 303233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q6p6 = _mm_packus_epi16(res_p, res_q); 304233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 305233d2500723e5594f3e7c70896ffeeef32b9c950ywan // wide flat 306233d2500723e5594f3e7c70896ffeeef32b9c950ywan // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 307233d2500723e5594f3e7c70896ffeeef32b9c950ywan 308233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_shuffle_epi32(flat, 68); 309233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_shuffle_epi32(flat2, 68); 310233d2500723e5594f3e7c70896ffeeef32b9c950ywan 311233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_andnot_si128(flat, q2p2); 312233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q2p2 = _mm_and_si128(flat, flat_q2p2); 313233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_or_si128(q2p2, flat_q2p2); 314233d2500723e5594f3e7c70896ffeeef32b9c950ywan 315233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs1ps1 = _mm_andnot_si128(flat, qs1ps1); 316233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q1p1 = _mm_and_si128(flat, flat_q1p1); 317233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); 318233d2500723e5594f3e7c70896ffeeef32b9c950ywan 319233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs0ps0 = _mm_andnot_si128(flat, qs0ps0); 320233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat_q0p0 = _mm_and_si128(flat, flat_q0p0); 321233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); 322233d2500723e5594f3e7c70896ffeeef32b9c950ywan 323233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6p6 = _mm_andnot_si128(flat2, q6p6); 324233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); 325233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6p6 = _mm_or_si128(q6p6, flat2_q6p6); 326233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); 327233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); 328233d2500723e5594f3e7c70896ffeeef32b9c950ywan 329233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5p5 = _mm_andnot_si128(flat2, q5p5); 330233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); 331233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5p5 = _mm_or_si128(q5p5, flat2_q5p5); 332233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); 333233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); 334233d2500723e5594f3e7c70896ffeeef32b9c950ywan 335233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4p4 = _mm_andnot_si128(flat2, q4p4); 336233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); 337233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4p4 = _mm_or_si128(q4p4, flat2_q4p4); 338233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); 339233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); 340233d2500723e5594f3e7c70896ffeeef32b9c950ywan 341233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3p3 = _mm_andnot_si128(flat2, q3p3); 342233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); 343233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3p3 = _mm_or_si128(q3p3, flat2_q3p3); 344233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); 345233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); 346233d2500723e5594f3e7c70896ffeeef32b9c950ywan 347233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_andnot_si128(flat2, q2p2); 348233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); 349233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_or_si128(q2p2, flat2_q2p2); 350233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); 351233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); 352233d2500723e5594f3e7c70896ffeeef32b9c950ywan 353233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_andnot_si128(flat2, q1p1); 354233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); 355233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_or_si128(q1p1, flat2_q1p1); 356233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); 357233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); 358233d2500723e5594f3e7c70896ffeeef32b9c950ywan 359233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_andnot_si128(flat2, q0p0); 360233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); 361233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_or_si128(q0p0, flat2_q0p0); 362233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); 363233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); 364233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 365233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 366233d2500723e5594f3e7c70896ffeeef32b9c950ywan 367233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, 368233d2500723e5594f3e7c70896ffeeef32b9c950ywan int p, 369233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit, 370233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit, 371233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh) { 372233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16); 373233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16); 374233d2500723e5594f3e7c70896ffeeef32b9c950ywan 375233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16); 376233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16); 377233d2500723e5594f3e7c70896ffeeef32b9c950ywan 378233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16); 379233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16); 380233d2500723e5594f3e7c70896ffeeef32b9c950ywan 381233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i zero = _mm_set1_epi16(0); 382233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i one = _mm_set1_epi8(1); 383233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 384233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i limit = _mm_load_si128((const __m128i *)_limit); 385233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 386233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i mask, hev, flat, flat2; 387233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p7, p6, p5; 388233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; 389233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i q5, q6, q7; 390233d2500723e5594f3e7c70896ffeeef32b9c950ywan int i = 0; 391233d2500723e5594f3e7c70896ffeeef32b9c950ywan 392233d2500723e5594f3e7c70896ffeeef32b9c950ywan p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); 393233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); 394233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); 395233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); 396233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); 397233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); 398233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); 399233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); 400233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); 401233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); 402233d2500723e5594f3e7c70896ffeeef32b9c950ywan 403233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[4 * 16], p4); 404233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[3 * 16], p3); 405233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[2 * 16], p2); 406233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[1 * 16], p1); 407233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[0 * 16], p0); 408233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[4 * 16], q4); 409233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[3 * 16], q3); 410233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[2 * 16], q2); 411233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[1 * 16], q1); 412233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[0 * 16], q0); 413233d2500723e5594f3e7c70896ffeeef32b9c950ywan 414233d2500723e5594f3e7c70896ffeeef32b9c950ywan 415233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 416233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), 417233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p1)); 418233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), 419233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q1)); 420233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i fe = _mm_set1_epi8(0xfe); 421233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 422233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), 423233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, p0)); 424233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), 425233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, p1)); 426233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work; 427233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 428233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_subs_epu8(flat, thresh); 429233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 430233d2500723e5594f3e7c70896ffeeef32b9c950ywan 431233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 432233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 433233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 434233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 435233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 436233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(flat, mask); 437233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p1 - p0) > limit) * -1; 438233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(q1 - q0) > limit) * -1; 439233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), 440233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p1, p2)), 441233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(p3, p2), 442233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p2, p3))); 443233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 444233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), 445233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, q2)), 446233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3, q2), 447233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q2, q3))); 448233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 449233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(mask, limit); 450233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_cmpeq_epi8(mask, zero); 451233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 452233d2500723e5594f3e7c70896ffeeef32b9c950ywan 453233d2500723e5594f3e7c70896ffeeef32b9c950ywan // lp filter 454233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 455233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t4 = _mm_set1_epi8(4); 456233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t3 = _mm_set1_epi8(3); 457233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t80 = _mm_set1_epi8(0x80); 458233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i te0 = _mm_set1_epi8(0xe0); 459233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1f = _mm_set1_epi8(0x1f); 460233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1 = _mm_set1_epi8(0x1); 461233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t7f = _mm_set1_epi8(0x7f); 462233d2500723e5594f3e7c70896ffeeef32b9c950ywan 463233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i ps1 = _mm_xor_si128(p1, t80); 464233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i ps0 = _mm_xor_si128(p0, t80); 465233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs0 = _mm_xor_si128(q0, t80); 466233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i qs1 = _mm_xor_si128(q1, t80); 467233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filt; 468233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work_a; 469233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filter1, filter2; 470233d2500723e5594f3e7c70896ffeeef32b9c950ywan 471233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 472233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_subs_epi8(qs0, ps0); 473233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 474233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 475233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 476233d2500723e5594f3e7c70896ffeeef32b9c950ywan // (vp9_filter + 3 * (qs0 - ps0)) & mask 477233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, mask); 478233d2500723e5594f3e7c70896ffeeef32b9c950ywan 479233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_adds_epi8(filt, t4); 480233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_adds_epi8(filt, t3); 481233d2500723e5594f3e7c70896ffeeef32b9c950ywan 482233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter1 >> 3 483233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter1); 484233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_srli_epi16(filter1, 3); 485233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 486233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_and_si128(filter1, t1f); 487233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_or_si128(filter1, work_a); 488233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 489233d2500723e5594f3e7c70896ffeeef32b9c950ywan 490233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter2 >> 3 491233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter2); 492233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_srli_epi16(filter2, 3); 493233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 494233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_and_si128(filter2, t1f); 495233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_or_si128(filter2, work_a); 496233d2500723e5594f3e7c70896ffeeef32b9c950ywan ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 497233d2500723e5594f3e7c70896ffeeef32b9c950ywan 498233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filt >> 1 499233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filter1, t1); 500233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filt); 501233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_srli_epi16(filt, 1); 502233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, t80); 503233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, t7f); 504233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_or_si128(filt, work_a); 505233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_andnot_si128(hev, filt); 506233d2500723e5594f3e7c70896ffeeef32b9c950ywan ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 507233d2500723e5594f3e7c70896ffeeef32b9c950ywan qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 508233d2500723e5594f3e7c70896ffeeef32b9c950ywan // loopfilter done 509233d2500723e5594f3e7c70896ffeeef32b9c950ywan 510233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 511233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work; 512233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), 513233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p2)), 514233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q2, q0), 515233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q2))); 516233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(work, flat); 517233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), 518233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p3)), 519233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3, q0), 520233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q3))); 521233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(work, flat); 522233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), 523233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p4)), 524233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q4, q0), 525233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q4))); 526233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_subs_epu8(flat, one); 527233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_cmpeq_epi8(flat, zero); 528233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_and_si128(flat, mask); 529233d2500723e5594f3e7c70896ffeeef32b9c950ywan 530233d2500723e5594f3e7c70896ffeeef32b9c950ywan p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); 531233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); 532233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), 533233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p5)), 534233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q5, q0), 535233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q5))); 536233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[5 * 16], p5); 537233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[5 * 16], q5); 538233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(work, flat2); 539233d2500723e5594f3e7c70896ffeeef32b9c950ywan p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); 540233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); 541233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), 542233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p6)), 543233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q6, q0), 544233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q6))); 545233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[6 * 16], p6); 546233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[6 * 16], q6); 547233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(work, flat2); 548233d2500723e5594f3e7c70896ffeeef32b9c950ywan 549233d2500723e5594f3e7c70896ffeeef32b9c950ywan p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); 550233d2500723e5594f3e7c70896ffeeef32b9c950ywan q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); 551233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), 552233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p7)), 553233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q7, q0), 554233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q7))); 555233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&ap[7 * 16], p7); 556233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&aq[7 * 16], q7); 557233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_max_epu8(work, flat2); 558233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_subs_epu8(flat2, one); 559233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_cmpeq_epi8(flat2, zero); 560233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask 561233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 562233d2500723e5594f3e7c70896ffeeef32b9c950ywan 563233d2500723e5594f3e7c70896ffeeef32b9c950ywan // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 564233d2500723e5594f3e7c70896ffeeef32b9c950ywan // flat and wide flat calculations 565233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 566233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i eight = _mm_set1_epi16(8); 567233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i four = _mm_set1_epi16(4); 568233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i temp_flat2 = flat2; 569233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src = s; 570233d2500723e5594f3e7c70896ffeeef32b9c950ywan int i = 0; 571233d2500723e5594f3e7c70896ffeeef32b9c950ywan do { 572233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i workp_shft; 573233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i a, b, c; 574233d2500723e5594f3e7c70896ffeeef32b9c950ywan 575233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned int off = i * 8; 576233d2500723e5594f3e7c70896ffeeef32b9c950ywan p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)), 577233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 578233d2500723e5594f3e7c70896ffeeef32b9c950ywan p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)), 579233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 580233d2500723e5594f3e7c70896ffeeef32b9c950ywan p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)), 581233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 582233d2500723e5594f3e7c70896ffeeef32b9c950ywan p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)), 583233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 584233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)), 585233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 586233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)), 587233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 588233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)), 589233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 590233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)), 591233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 592233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)), 593233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 594233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)), 595233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 596233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)), 597233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 598233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)), 599233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 600233d2500723e5594f3e7c70896ffeeef32b9c950ywan q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)), 601233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 602233d2500723e5594f3e7c70896ffeeef32b9c950ywan q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)), 603233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 604233d2500723e5594f3e7c70896ffeeef32b9c950ywan q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)), 605233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 606233d2500723e5594f3e7c70896ffeeef32b9c950ywan q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)), 607233d2500723e5594f3e7c70896ffeeef32b9c950ywan zero); 608233d2500723e5594f3e7c70896ffeeef32b9c950ywan 609233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 610233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); 611233d2500723e5594f3e7c70896ffeeef32b9c950ywan 612233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); 613233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); 614233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); 615233d2500723e5594f3e7c70896ffeeef32b9c950ywan 616233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8], 617233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 618233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 619233d2500723e5594f3e7c70896ffeeef32b9c950ywan 620233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); 621233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 622233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8], 623233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 624233d2500723e5594f3e7c70896ffeeef32b9c950ywan 625233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q1, a); 626233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); 627233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8], 628233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 629233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 630233d2500723e5594f3e7c70896ffeeef32b9c950ywan 631233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); 632233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 633233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8], 634233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 635233d2500723e5594f3e7c70896ffeeef32b9c950ywan 636233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q2, a); 637233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); 638233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op[i * 8], 639233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 640233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 641233d2500723e5594f3e7c70896ffeeef32b9c950ywan 642233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); 643233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 644233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8], 645233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 646233d2500723e5594f3e7c70896ffeeef32b9c950ywan 647233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q3, a); 648233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); 649233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq[i * 8], 650233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 651233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 652233d2500723e5594f3e7c70896ffeeef32b9c950ywan 653233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); 654233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 655233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8], 656233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 657233d2500723e5594f3e7c70896ffeeef32b9c950ywan 658233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(q3, b); 659233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); 660233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8], 661233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 662233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 663233d2500723e5594f3e7c70896ffeeef32b9c950ywan 664233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(q4, c); 665233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); 666233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 667233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8], 668233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 669233d2500723e5594f3e7c70896ffeeef32b9c950ywan 670233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(q3, b); 671233d2500723e5594f3e7c70896ffeeef32b9c950ywan b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); 672233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8], 673233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) 674233d2500723e5594f3e7c70896ffeeef32b9c950ywan , b)); 675233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q5, a); 676233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); 677233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 678233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8], 679233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 680233d2500723e5594f3e7c70896ffeeef32b9c950ywan 681233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q6, a); 682233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); 683233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 684233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_op[i * 8], 685233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 686233d2500723e5594f3e7c70896ffeeef32b9c950ywan 687233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 688233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); 689233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 690233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[i * 8], 691233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 692233d2500723e5594f3e7c70896ffeeef32b9c950ywan 693233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 694233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); 695233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 696233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8], 697233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 698233d2500723e5594f3e7c70896ffeeef32b9c950ywan 699233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 700233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); 701233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 702233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8], 703233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 704233d2500723e5594f3e7c70896ffeeef32b9c950ywan 705233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 706233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); 707233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 708233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8], 709233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 710233d2500723e5594f3e7c70896ffeeef32b9c950ywan 711233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 712233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); 713233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 714233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8], 715233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 716233d2500723e5594f3e7c70896ffeeef32b9c950ywan 717233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 718233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); 719233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 720233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8], 721233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 722233d2500723e5594f3e7c70896ffeeef32b9c950ywan 723233d2500723e5594f3e7c70896ffeeef32b9c950ywan a = _mm_add_epi16(q7, a); 724233d2500723e5594f3e7c70896ffeeef32b9c950ywan c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); 725233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); 726233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8], 727233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 728233d2500723e5594f3e7c70896ffeeef32b9c950ywan 729233d2500723e5594f3e7c70896ffeeef32b9c950ywan temp_flat2 = _mm_srli_si128(temp_flat2, 8); 730233d2500723e5594f3e7c70896ffeeef32b9c950ywan src += 8; 731233d2500723e5594f3e7c70896ffeeef32b9c950ywan } while (++i < 2); 732233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 733233d2500723e5594f3e7c70896ffeeef32b9c950ywan // wide flat 734233d2500723e5594f3e7c70896ffeeef32b9c950ywan // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 735233d2500723e5594f3e7c70896ffeeef32b9c950ywan 736233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&ap[2 * 16]); 737233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]); 738233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 739233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_and_si128(flat, p2); 740233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_or_si128(work_a, p2); 741233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_op[2 * 16], p2); 742233d2500723e5594f3e7c70896ffeeef32b9c950ywan 743233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]); 744233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, ps1); 745233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_and_si128(flat, p1); 746233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_or_si128(work_a, p1); 747233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_op[1 * 16], p1); 748233d2500723e5594f3e7c70896ffeeef32b9c950ywan 749233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_load_si128((__m128i *)&flat_op[0]); 750233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, ps0); 751233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_and_si128(flat, p0); 752233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_or_si128(work_a, p0); 753233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_op[0], p0); 754233d2500723e5594f3e7c70896ffeeef32b9c950ywan 755233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_load_si128((__m128i *)&flat_oq[0]); 756233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, qs0); 757233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_and_si128(flat, q0); 758233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_or_si128(work_a, q0); 759233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_oq[0], q0); 760233d2500723e5594f3e7c70896ffeeef32b9c950ywan 761233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); 762233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, qs1); 763233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_and_si128(flat, q1); 764233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_or_si128(work_a, q1); 765233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1); 766233d2500723e5594f3e7c70896ffeeef32b9c950ywan 767233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&aq[2 * 16]); 768233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); 769233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 770233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_and_si128(flat, q2); 771233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_or_si128(work_a, q2); 772233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2); 773233d2500723e5594f3e7c70896ffeeef32b9c950ywan 774233d2500723e5594f3e7c70896ffeeef32b9c950ywan // write out op6 - op3 775233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 776233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst = (s - 7 * p); 777233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (i = 6; i > 2; i--) { 778233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i flat2_output; 779233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&ap[i * 16]); 780233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]); 781233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 782233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_output = _mm_and_si128(flat2, flat2_output); 783233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_or_si128(work_a, flat2_output); 784233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)dst, work_a); 785233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst += p; 786233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 787233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 788233d2500723e5594f3e7c70896ffeeef32b9c950ywan 789233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]); 790233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]); 791233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 792233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_and_si128(flat2, p2); 793233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_or_si128(work_a, p2); 794233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 3 * p), p2); 795233d2500723e5594f3e7c70896ffeeef32b9c950ywan 796233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]); 797233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]); 798233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 799233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_and_si128(flat2, p1); 800233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_or_si128(work_a, p1); 801233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 2 * p), p1); 802233d2500723e5594f3e7c70896ffeeef32b9c950ywan 803233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_op[0]); 804233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_load_si128((__m128i *)&flat2_op[0]); 805233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 806233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_and_si128(flat2, p0); 807233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_or_si128(work_a, p0); 808233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 1 * p), p0); 809233d2500723e5594f3e7c70896ffeeef32b9c950ywan 810233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_oq[0]); 811233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_load_si128((__m128i *)&flat2_oq[0]); 812233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 813233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_and_si128(flat2, q0); 814233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_or_si128(work_a, q0); 815233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 0 * p), q0); 816233d2500723e5594f3e7c70896ffeeef32b9c950ywan 817233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); 818233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_load_si128((__m128i *)&flat2_oq[16]); 819233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 820233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_and_si128(flat2, q1); 821233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_or_si128(work_a, q1); 822233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 1 * p), q1); 823233d2500723e5594f3e7c70896ffeeef32b9c950ywan 824233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); 825233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]); 826233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 827233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_and_si128(flat2, q2); 828233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_or_si128(work_a, q2); 829233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 2 * p), q2); 830233d2500723e5594f3e7c70896ffeeef32b9c950ywan 831233d2500723e5594f3e7c70896ffeeef32b9c950ywan // write out oq3 - oq7 832233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 833233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst = (s + 3 * p); 834233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (i = 3; i < 7; i++) { 835233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i flat2_output; 836233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_load_si128((__m128i *)&aq[i * 16]); 837233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]); 838233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat2, work_a); 839233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat2_output = _mm_and_si128(flat2, flat2_output); 840233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_or_si128(work_a, flat2_output); 841233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)dst, work_a); 842233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst += p; 843233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 844233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 845233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 846233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 847233d2500723e5594f3e7c70896ffeeef32b9c950ywan 848233d2500723e5594f3e7c70896ffeeef32b9c950ywan// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 849233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_16_sse2(unsigned char *s, int p, 850233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit, 851233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit, 852233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh, int count) { 853233d2500723e5594f3e7c70896ffeeef32b9c950ywan if (count == 1) 854233d2500723e5594f3e7c70896ffeeef32b9c950ywan mb_lpf_horizontal_edge_w_sse2_8(s, p, _blimit, _limit, _thresh); 855233d2500723e5594f3e7c70896ffeeef32b9c950ywan else 856233d2500723e5594f3e7c70896ffeeef32b9c950ywan mb_lpf_horizontal_edge_w_sse2_16(s, p, _blimit, _limit, _thresh); 857233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 858233d2500723e5594f3e7c70896ffeeef32b9c950ywan 859233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_8_sse2(unsigned char *s, int p, 860233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit, 861233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit, 862233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh, int count) { 863233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); 864233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); 865233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); 866233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); 867233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); 868233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); 869233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i zero = _mm_set1_epi16(0); 870233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); 871233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i limit = _mm_load_si128((const __m128i *)_limit); 872233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); 873233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i mask, hev, flat; 874233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p3, p2, p1, p0, q0, q1, q2, q3; 875233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; 876233d2500723e5594f3e7c70896ffeeef32b9c950ywan 877233d2500723e5594f3e7c70896ffeeef32b9c950ywan (void)count; 878233d2500723e5594f3e7c70896ffeeef32b9c950ywan 879233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), 880233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_loadl_epi64((__m128i *)(s + 3 * p))); 881233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), 882233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_loadl_epi64((__m128i *)(s + 2 * p))); 883233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), 884233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_loadl_epi64((__m128i *)(s + 1 * p))); 885233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), 886233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_loadl_epi64((__m128i *)(s - 0 * p))); 887233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1q1 = _mm_shuffle_epi32(q1p1, 78); 888233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0q0 = _mm_shuffle_epi32(q0p0, 78); 889233d2500723e5594f3e7c70896ffeeef32b9c950ywan 890233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 891233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filter_mask and hev_mask 892233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i one = _mm_set1_epi8(1); 893233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i fe = _mm_set1_epi8(0xfe); 894233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ff = _mm_cmpeq_epi8(fe, fe); 895233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; 896233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), 897233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q1p1)); 898233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); 899233d2500723e5594f3e7c70896ffeeef32b9c950ywan 900233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), 901233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0q0, q0p0)); 902233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), 903233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p1q1, q1p1)); 904233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 905233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_subs_epu8(flat, thresh); 906233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 907233d2500723e5594f3e7c70896ffeeef32b9c950ywan 908233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 909233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 910233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 911233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 912233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 913233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(abs_p1p0, mask); 914233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p1 - p0) > limit) * -1; 915233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(q1 - q0) > limit) * -1; 916233d2500723e5594f3e7c70896ffeeef32b9c950ywan 917233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), 918233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1p1, q2p2)), 919233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), 920233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q2p2, q3p3))); 921233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 922233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); 923233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(mask, limit); 924233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_cmpeq_epi8(mask, zero); 925233d2500723e5594f3e7c70896ffeeef32b9c950ywan 926233d2500723e5594f3e7c70896ffeeef32b9c950ywan // flat_mask4 927233d2500723e5594f3e7c70896ffeeef32b9c950ywan 928233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), 929233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q2p2)), 930233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), 931233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0p0, q3p3))); 932233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, flat); 933233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); 934233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_subs_epu8(flat, one); 935233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_cmpeq_epi8(flat, zero); 936233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_and_si128(flat, mask); 937233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 938233d2500723e5594f3e7c70896ffeeef32b9c950ywan 939233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 940233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i four = _mm_set1_epi16(4); 941233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src = s; 942233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 943233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i workp_a, workp_b, workp_shft; 944233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); 945233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); 946233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); 947233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); 948233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); 949233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); 950233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); 951233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); 952233d2500723e5594f3e7c70896ffeeef32b9c950ywan 953233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); 954233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); 955233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); 956233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 957233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op2[0], 958233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 959233d2500723e5594f3e7c70896ffeeef32b9c950ywan 960233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); 961233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 962233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op1[0], 963233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 964233d2500723e5594f3e7c70896ffeeef32b9c950ywan 965233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); 966233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); 967233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 968233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op0[0], 969233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 970233d2500723e5594f3e7c70896ffeeef32b9c950ywan 971233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); 972233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); 973233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 974233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq0[0], 975233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 976233d2500723e5594f3e7c70896ffeeef32b9c950ywan 977233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); 978233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); 979233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 980233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq1[0], 981233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 982233d2500723e5594f3e7c70896ffeeef32b9c950ywan 983233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); 984233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); 985233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 986233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq2[0], 987233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 988233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 989233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 990233d2500723e5594f3e7c70896ffeeef32b9c950ywan // lp filter 991233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 992233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t4 = _mm_set1_epi8(4); 993233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t3 = _mm_set1_epi8(3); 994233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t80 = _mm_set1_epi8(0x80); 995233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1 = _mm_set1_epi8(0x1); 996233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), 997233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 998233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), 999233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1000233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), 1001233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1002233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), 1003233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1004233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filt; 1005233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work_a; 1006233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filter1, filter2; 1007233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1008233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 1009233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_subs_epi8(qs0, ps0); 1010233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1011233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1012233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1013233d2500723e5594f3e7c70896ffeeef32b9c950ywan // (vp9_filter + 3 * (qs0 - ps0)) & mask 1014233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, mask); 1015233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1016233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_adds_epi8(filt, t4); 1017233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_adds_epi8(filt, t3); 1018233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1019233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter1 >> 3 1020233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_unpacklo_epi8(zero, filter1); 1021233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_srai_epi16(filter1, 11); 1022233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_packs_epi16(filter1, filter1); 1023233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1024233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter2 >> 3 1025233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_unpacklo_epi8(zero, filter2); 1026233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_srai_epi16(filter2, 11); 1027233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_packs_epi16(filter2, zero); 1028233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1029233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filt >> 1 1030233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filter1, t1); 1031233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_unpacklo_epi8(zero, filt); 1032233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_srai_epi16(filt, 9); 1033233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_packs_epi16(filt, zero); 1034233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1035233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_andnot_si128(hev, filt); 1036233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1037233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 1038233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_loadl_epi64((__m128i *)flat_oq0); 1039233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1040233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_and_si128(flat, q0); 1041233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_or_si128(work_a, q0); 1042233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1043233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 1044233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_loadl_epi64((__m128i *)flat_oq1); 1045233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1046233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_and_si128(flat, q1); 1047233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_or_si128(work_a, q1); 1048233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1049233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); 1050233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_loadl_epi64((__m128i *)flat_oq2); 1051233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1052233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_and_si128(flat, q2); 1053233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_or_si128(work_a, q2); 1054233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1055233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 1056233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_loadl_epi64((__m128i *)flat_op0); 1057233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1058233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_and_si128(flat, p0); 1059233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_or_si128(work_a, p0); 1060233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1061233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 1062233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_loadl_epi64((__m128i *)flat_op1); 1063233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1064233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_and_si128(flat, p1); 1065233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_or_si128(work_a, p1); 1066233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1067233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); 1068233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_loadl_epi64((__m128i *)flat_op2); 1069233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1070233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_and_si128(flat, p2); 1071233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_or_si128(work_a, p2); 1072233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1073233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 3 * p), p2); 1074233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 2 * p), p1); 1075233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s - 1 * p), p0); 1076233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s + 0 * p), q0); 1077233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s + 1 * p), q1); 1078233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)(s + 2 * p), q2); 1079233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1080233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1081233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1082233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, 1083233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_blimit0, 1084233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_limit0, 1085233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_thresh0, 1086233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_blimit1, 1087233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_limit1, 1088233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *_thresh1) { 1089233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); 1090233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); 1091233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); 1092233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); 1093233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); 1094233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); 1095233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i zero = _mm_set1_epi16(0); 1096233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i blimit = 1097233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), 1098233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_blimit1)); 1099233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i limit = 1100233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), 1101233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_limit1)); 1102233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i thresh = 1103233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), 1104233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_thresh1)); 1105233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1106233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i mask, hev, flat; 1107233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p3, p2, p1, p0, q0, q1, q2, q3; 1108233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1109233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); 1110233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); 1111233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); 1112233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); 1113233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); 1114233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); 1115233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); 1116233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); 1117233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 1118233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), 1119233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p1)); 1120233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), 1121233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q1)); 1122233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i one = _mm_set1_epi8(1); 1123233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i fe = _mm_set1_epi8(0xfe); 1124233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 1125233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), 1126233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, p0)); 1127233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), 1128233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, p1)); 1129233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work; 1130233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1131233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filter_mask and hev_mask 1132233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 1133233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_subs_epu8(flat, thresh); 1134233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 1135233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1136233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 1137233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 1138233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 1139233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 1140233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 1141233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(flat, mask); 1142233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p1 - p0) > limit) * -1; 1143233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(q1 - q0) > limit) * -1; 1144233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), 1145233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p1, p2)), 1146233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(p3, p2), 1147233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p2, p3))); 1148233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 1149233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), 1150233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, q2)), 1151233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3, q2), 1152233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q2, q3))); 1153233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 1154233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(mask, limit); 1155233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_cmpeq_epi8(mask, zero); 1156233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1157233d2500723e5594f3e7c70896ffeeef32b9c950ywan // flat_mask4 1158233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), 1159233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p2)), 1160233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q2, q0), 1161233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q2))); 1162233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(work, flat); 1163233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), 1164233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p3)), 1165233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3, q0), 1166233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q3))); 1167233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(work, flat); 1168233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_subs_epu8(flat, one); 1169233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_cmpeq_epi8(flat, zero); 1170233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_and_si128(flat, mask); 1171233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1172233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 1173233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i four = _mm_set1_epi16(4); 1174233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src = s; 1175233d2500723e5594f3e7c70896ffeeef32b9c950ywan int i = 0; 1176233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1177233d2500723e5594f3e7c70896ffeeef32b9c950ywan do { 1178233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i workp_a, workp_b, workp_shft; 1179233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); 1180233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); 1181233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); 1182233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); 1183233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); 1184233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); 1185233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); 1186233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); 1187233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1188233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); 1189233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); 1190233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); 1191233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1192233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op2[i * 8], 1193233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1194233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1195233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); 1196233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1197233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op1[i * 8], 1198233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1199233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1200233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); 1201233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); 1202233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1203233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_op0[i * 8], 1204233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1205233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1206233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); 1207233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); 1208233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1209233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], 1210233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1211233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1212233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); 1213233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); 1214233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1215233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], 1216233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1217233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1218233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); 1219233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); 1220233d2500723e5594f3e7c70896ffeeef32b9c950ywan workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); 1221233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], 1222233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_packus_epi16(workp_shft, workp_shft)); 1223233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1224233d2500723e5594f3e7c70896ffeeef32b9c950ywan src += 8; 1225233d2500723e5594f3e7c70896ffeeef32b9c950ywan } while (++i < 2); 1226233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1227233d2500723e5594f3e7c70896ffeeef32b9c950ywan // lp filter 1228233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 1229233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t4 = _mm_set1_epi8(4); 1230233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t3 = _mm_set1_epi8(3); 1231233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t80 = _mm_set1_epi8(0x80); 1232233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i te0 = _mm_set1_epi8(0xe0); 1233233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1f = _mm_set1_epi8(0x1f); 1234233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1 = _mm_set1_epi8(0x1); 1235233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t7f = _mm_set1_epi8(0x7f); 1236233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1237233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), 1238233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1239233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), 1240233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1241233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), 1242233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1243233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), 1244233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1245233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filt; 1246233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work_a; 1247233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filter1, filter2; 1248233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1249233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 1250233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_subs_epi8(qs0, ps0); 1251233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1252233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1253233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1254233d2500723e5594f3e7c70896ffeeef32b9c950ywan // (vp9_filter + 3 * (qs0 - ps0)) & mask 1255233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, mask); 1256233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1257233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_adds_epi8(filt, t4); 1258233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_adds_epi8(filt, t3); 1259233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1260233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter1 >> 3 1261233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter1); 1262233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_srli_epi16(filter1, 3); 1263233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 1264233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_and_si128(filter1, t1f); 1265233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_or_si128(filter1, work_a); 1266233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1267233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter2 >> 3 1268233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter2); 1269233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_srli_epi16(filter2, 3); 1270233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 1271233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_and_si128(filter2, t1f); 1272233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_or_si128(filter2, work_a); 1273233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1274233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filt >> 1 1275233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filter1, t1); 1276233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filt); 1277233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_srli_epi16(filt, 1); 1278233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, t80); 1279233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, t7f); 1280233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_or_si128(filt, work_a); 1281233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1282233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_andnot_si128(hev, filt); 1283233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1284233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 1285233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_load_si128((__m128i *)flat_oq0); 1286233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1287233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_and_si128(flat, q0); 1288233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_or_si128(work_a, q0); 1289233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1290233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 1291233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_load_si128((__m128i *)flat_oq1); 1292233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1293233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_and_si128(flat, q1); 1294233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_or_si128(work_a, q1); 1295233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1296233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); 1297233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_load_si128((__m128i *)flat_oq2); 1298233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1299233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_and_si128(flat, q2); 1300233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_or_si128(work_a, q2); 1301233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1302233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 1303233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_load_si128((__m128i *)flat_op0); 1304233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1305233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_and_si128(flat, p0); 1306233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_or_si128(work_a, p0); 1307233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1308233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 1309233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_load_si128((__m128i *)flat_op1); 1310233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1311233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_and_si128(flat, p1); 1312233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_or_si128(work_a, p1); 1313233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1314233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); 1315233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_load_si128((__m128i *)flat_op2); 1316233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_andnot_si128(flat, work_a); 1317233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_and_si128(flat, p2); 1318233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_or_si128(work_a, p2); 1319233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1320233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 3 * p), p2); 1321233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 2 * p), p1); 1322233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 1 * p), p0); 1323233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 0 * p), q0); 1324233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 1 * p), q1); 1325233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 2 * p), q2); 1326233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1327233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1328233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1329233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, 1330233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit0, 1331233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit0, 1332233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh0, 1333233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_blimit1, 1334233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_limit1, 1335233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *_thresh1) { 1336233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i blimit = 1337233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), 1338233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_blimit1)); 1339233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i limit = 1340233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), 1341233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_limit1)); 1342233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i thresh = 1343233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), 1344233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_load_si128((const __m128i *)_thresh1)); 1345233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i zero = _mm_set1_epi16(0); 1346233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i p3, p2, p1, p0, q0, q1, q2, q3; 1347233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i mask, hev, flat; 1348233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1349233d2500723e5594f3e7c70896ffeeef32b9c950ywan p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); 1350233d2500723e5594f3e7c70896ffeeef32b9c950ywan p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); 1351233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); 1352233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); 1353233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); 1354233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); 1355233d2500723e5594f3e7c70896ffeeef32b9c950ywan q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); 1356233d2500723e5594f3e7c70896ffeeef32b9c950ywan q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); 1357233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1358233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filter_mask and hev_mask 1359233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 1360233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), 1361233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p0, p1)); 1362233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), 1363233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, q1)); 1364233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i fe = _mm_set1_epi8(0xfe); 1365233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); 1366233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), 1367233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q0, p0)); 1368233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), 1369233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, p1)); 1370233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work; 1371233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1372233d2500723e5594f3e7c70896ffeeef32b9c950ywan flat = _mm_max_epu8(abs_p1p0, abs_q1q0); 1373233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_subs_epu8(flat, thresh); 1374233d2500723e5594f3e7c70896ffeeef32b9c950ywan hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); 1375233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1376233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); 1377233d2500723e5594f3e7c70896ffeeef32b9c950ywan abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); 1378233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); 1379233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); 1380233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 1381233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(flat, mask); 1382233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(p1 - p0) > limit) * -1; 1383233d2500723e5594f3e7c70896ffeeef32b9c950ywan // mask |= (abs(q1 - q0) > limit) * -1; 1384233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), 1385233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p1, p2)), 1386233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(p3, p2), 1387233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(p2, p3))); 1388233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 1389233d2500723e5594f3e7c70896ffeeef32b9c950ywan work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), 1390233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q1, q2)), 1391233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_or_si128(_mm_subs_epu8(q3, q2), 1392233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_subs_epu8(q2, q3))); 1393233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_max_epu8(work, mask); 1394233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_subs_epu8(mask, limit); 1395233d2500723e5594f3e7c70896ffeeef32b9c950ywan mask = _mm_cmpeq_epi8(mask, zero); 1396233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1397233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1398233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filter4 1399233d2500723e5594f3e7c70896ffeeef32b9c950ywan { 1400233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t4 = _mm_set1_epi8(4); 1401233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t3 = _mm_set1_epi8(3); 1402233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t80 = _mm_set1_epi8(0x80); 1403233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i te0 = _mm_set1_epi8(0xe0); 1404233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1f = _mm_set1_epi8(0x1f); 1405233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t1 = _mm_set1_epi8(0x1); 1406233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i t7f = _mm_set1_epi8(0x7f); 1407233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1408233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), 1409233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1410233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), 1411233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1412233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), 1413233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1414233d2500723e5594f3e7c70896ffeeef32b9c950ywan const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), 1415233d2500723e5594f3e7c70896ffeeef32b9c950ywan t80); 1416233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filt; 1417233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i work_a; 1418233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i filter1, filter2; 1419233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1420233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); 1421233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_subs_epi8(qs0, ps0); 1422233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1423233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1424233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filt, work_a); 1425233d2500723e5594f3e7c70896ffeeef32b9c950ywan // (vp9_filter + 3 * (qs0 - ps0)) & mask 1426233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, mask); 1427233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1428233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_adds_epi8(filt, t4); 1429233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_adds_epi8(filt, t3); 1430233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1431233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter1 >> 3 1432233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter1); 1433233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_srli_epi16(filter1, 3); 1434233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 1435233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_and_si128(filter1, t1f); 1436233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter1 = _mm_or_si128(filter1, work_a); 1437233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1438233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Filter2 >> 3 1439233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filter2); 1440233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_srli_epi16(filter2, 3); 1441233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, te0); 1442233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_and_si128(filter2, t1f); 1443233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter2 = _mm_or_si128(filter2, work_a); 1444233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1445233d2500723e5594f3e7c70896ffeeef32b9c950ywan // filt >> 1 1446233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_adds_epi8(filter1, t1); 1447233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_cmpgt_epi8(zero, filt); 1448233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_srli_epi16(filt, 1); 1449233d2500723e5594f3e7c70896ffeeef32b9c950ywan work_a = _mm_and_si128(work_a, t80); 1450233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_and_si128(filt, t7f); 1451233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_or_si128(filt, work_a); 1452233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1453233d2500723e5594f3e7c70896ffeeef32b9c950ywan filt = _mm_andnot_si128(hev, filt); 1454233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1455233d2500723e5594f3e7c70896ffeeef32b9c950ywan q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); 1456233d2500723e5594f3e7c70896ffeeef32b9c950ywan q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); 1457233d2500723e5594f3e7c70896ffeeef32b9c950ywan p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); 1458233d2500723e5594f3e7c70896ffeeef32b9c950ywan p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); 1459233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1460233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 2 * p), p1); 1461233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s - 1 * p), p0); 1462233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 0 * p), q0); 1463233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(s + 1 * p), q1); 1464233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 1465233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1466233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1467233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic INLINE void transpose8x16(unsigned char *in0, unsigned char *in1, 1468233d2500723e5594f3e7c70896ffeeef32b9c950ywan int in_p, unsigned char *out, int out_p) { 1469233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i x0, x1, x2, x3, x4, x5, x6, x7; 1470233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i x8, x9, x10, x11, x12, x13, x14, x15; 1471233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1472233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Read in 16 lines 1473233d2500723e5594f3e7c70896ffeeef32b9c950ywan x0 = _mm_loadl_epi64((__m128i *)in0); 1474233d2500723e5594f3e7c70896ffeeef32b9c950ywan x8 = _mm_loadl_epi64((__m128i *)in1); 1475233d2500723e5594f3e7c70896ffeeef32b9c950ywan x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); 1476233d2500723e5594f3e7c70896ffeeef32b9c950ywan x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); 1477233d2500723e5594f3e7c70896ffeeef32b9c950ywan x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); 1478233d2500723e5594f3e7c70896ffeeef32b9c950ywan x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); 1479233d2500723e5594f3e7c70896ffeeef32b9c950ywan x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); 1480233d2500723e5594f3e7c70896ffeeef32b9c950ywan x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); 1481233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); 1482233d2500723e5594f3e7c70896ffeeef32b9c950ywan x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); 1483233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); 1484233d2500723e5594f3e7c70896ffeeef32b9c950ywan x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); 1485233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); 1486233d2500723e5594f3e7c70896ffeeef32b9c950ywan x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); 1487233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); 1488233d2500723e5594f3e7c70896ffeeef32b9c950ywan x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); 1489233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1490233d2500723e5594f3e7c70896ffeeef32b9c950ywan x0 = _mm_unpacklo_epi8(x0, x1); 1491233d2500723e5594f3e7c70896ffeeef32b9c950ywan x1 = _mm_unpacklo_epi8(x2, x3); 1492233d2500723e5594f3e7c70896ffeeef32b9c950ywan x2 = _mm_unpacklo_epi8(x4, x5); 1493233d2500723e5594f3e7c70896ffeeef32b9c950ywan x3 = _mm_unpacklo_epi8(x6, x7); 1494233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1495233d2500723e5594f3e7c70896ffeeef32b9c950ywan x8 = _mm_unpacklo_epi8(x8, x9); 1496233d2500723e5594f3e7c70896ffeeef32b9c950ywan x9 = _mm_unpacklo_epi8(x10, x11); 1497233d2500723e5594f3e7c70896ffeeef32b9c950ywan x10 = _mm_unpacklo_epi8(x12, x13); 1498233d2500723e5594f3e7c70896ffeeef32b9c950ywan x11 = _mm_unpacklo_epi8(x14, x15); 1499233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1500233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_unpacklo_epi16(x0, x1); 1501233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_unpacklo_epi16(x2, x3); 1502233d2500723e5594f3e7c70896ffeeef32b9c950ywan x12 = _mm_unpacklo_epi16(x8, x9); 1503233d2500723e5594f3e7c70896ffeeef32b9c950ywan x13 = _mm_unpacklo_epi16(x10, x11); 1504233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1505233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_unpacklo_epi32(x4, x5); 1506233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_unpackhi_epi32(x4, x5); 1507233d2500723e5594f3e7c70896ffeeef32b9c950ywan x14 = _mm_unpacklo_epi32(x12, x13); 1508233d2500723e5594f3e7c70896ffeeef32b9c950ywan x15 = _mm_unpackhi_epi32(x12, x13); 1509233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1510233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Store first 4-line result 1511233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); 1512233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); 1513233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); 1514233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); 1515233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1516233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_unpackhi_epi16(x0, x1); 1517233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_unpackhi_epi16(x2, x3); 1518233d2500723e5594f3e7c70896ffeeef32b9c950ywan x12 = _mm_unpackhi_epi16(x8, x9); 1519233d2500723e5594f3e7c70896ffeeef32b9c950ywan x13 = _mm_unpackhi_epi16(x10, x11); 1520233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1521233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_unpacklo_epi32(x4, x5); 1522233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_unpackhi_epi32(x4, x5); 1523233d2500723e5594f3e7c70896ffeeef32b9c950ywan x14 = _mm_unpacklo_epi32(x12, x13); 1524233d2500723e5594f3e7c70896ffeeef32b9c950ywan x15 = _mm_unpackhi_epi32(x12, x13); 1525233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1526233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Store second 4-line result 1527233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); 1528233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); 1529233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); 1530233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); 1531233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1532233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1533233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic INLINE void transpose(unsigned char *src[], int in_p, 1534233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst[], int out_p, 1535233d2500723e5594f3e7c70896ffeeef32b9c950ywan int num_8x8_to_transpose) { 1536233d2500723e5594f3e7c70896ffeeef32b9c950ywan int idx8x8 = 0; 1537233d2500723e5594f3e7c70896ffeeef32b9c950ywan __m128i x0, x1, x2, x3, x4, x5, x6, x7; 1538233d2500723e5594f3e7c70896ffeeef32b9c950ywan do { 1539233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *in = src[idx8x8]; 1540233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *out = dst[idx8x8]; 1541233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1542233d2500723e5594f3e7c70896ffeeef32b9c950ywan x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 1543233d2500723e5594f3e7c70896ffeeef32b9c950ywan x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 1544233d2500723e5594f3e7c70896ffeeef32b9c950ywan x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 1545233d2500723e5594f3e7c70896ffeeef32b9c950ywan x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 1546233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 1547233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 1548233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 1549233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 1550233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 1551233d2500723e5594f3e7c70896ffeeef32b9c950ywan x0 = _mm_unpacklo_epi8(x0, x1); 1552233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 1553233d2500723e5594f3e7c70896ffeeef32b9c950ywan x1 = _mm_unpacklo_epi8(x2, x3); 1554233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 1555233d2500723e5594f3e7c70896ffeeef32b9c950ywan x2 = _mm_unpacklo_epi8(x4, x5); 1556233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 1557233d2500723e5594f3e7c70896ffeeef32b9c950ywan x3 = _mm_unpacklo_epi8(x6, x7); 1558233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 1559233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_unpacklo_epi16(x0, x1); 1560233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 1561233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_unpacklo_epi16(x2, x3); 1562233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 1563233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_unpacklo_epi32(x4, x5); 1564233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 1565233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_unpackhi_epi32(x4, x5); 1566233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1567233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_pd((double *)(out + 0*out_p), 1568233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 1569233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pd((double *)(out + 1*out_p), 1570233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 1571233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_pd((double *)(out + 2*out_p), 1572233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 1573233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pd((double *)(out + 3*out_p), 1574233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 1575233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1576233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 1577233d2500723e5594f3e7c70896ffeeef32b9c950ywan x4 = _mm_unpackhi_epi16(x0, x1); 1578233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 1579233d2500723e5594f3e7c70896ffeeef32b9c950ywan x5 = _mm_unpackhi_epi16(x2, x3); 1580233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 1581233d2500723e5594f3e7c70896ffeeef32b9c950ywan x6 = _mm_unpacklo_epi32(x4, x5); 1582233d2500723e5594f3e7c70896ffeeef32b9c950ywan // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 1583233d2500723e5594f3e7c70896ffeeef32b9c950ywan x7 = _mm_unpackhi_epi32(x4, x5); 1584233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1585233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_pd((double *)(out + 4*out_p), 1586233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 1587233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pd((double *)(out + 5*out_p), 1588233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 1589233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storel_pd((double *)(out + 6*out_p), 1590233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 1591233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_storeh_pd((double *)(out + 7*out_p), 1592233d2500723e5594f3e7c70896ffeeef32b9c950ywan _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 1593233d2500723e5594f3e7c70896ffeeef32b9c950ywan } while (++idx8x8 < num_8x8_to_transpose); 1594233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1595233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1596233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, 1597233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *limit0, 1598233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *thresh0, 1599233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *blimit1, 1600233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *limit1, 1601233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *thresh1) { 1602233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); 1603233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src[2]; 1604233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst[2]; 1605233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1606233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose 8x16 1607233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); 1608233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1609233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Loop filtering 1610233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, 1611233d2500723e5594f3e7c70896ffeeef32b9c950ywan blimit1, limit1, thresh1); 1612233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = t_dst; 1613233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[1] = t_dst + 8; 1614233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = s - 4; 1615233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[1] = s - 4 + p * 8; 1616233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1617233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose back 1618233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, 16, dst, p, 2); 1619233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1620233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1621233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_8_sse2(unsigned char *s, int p, 1622233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *blimit, 1623233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *limit, 1624233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *thresh, int count) { 1625233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8); 1626233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src[1]; 1627233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst[1]; 1628233d2500723e5594f3e7c70896ffeeef32b9c950ywan (void)count; 1629233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1630233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose 8x8 1631233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = s - 4; 1632233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = t_dst; 1633233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1634233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, p, dst, 8, 1); 1635233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1636233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Loop filtering 1637233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, 1); 1638233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1639233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = t_dst; 1640233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = s - 4; 1641233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1642233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose back 1643233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, 8, dst, p, 1); 1644233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1645233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1646233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, 1647233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *limit0, 1648233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *thresh0, 1649233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *blimit1, 1650233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *limit1, 1651233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *thresh1) { 1652233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8); 1653233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src[2]; 1654233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst[2]; 1655233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1656233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose 8x16 1657233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); 1658233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1659233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Loop filtering 1660233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, 1661233d2500723e5594f3e7c70896ffeeef32b9c950ywan blimit1, limit1, thresh1); 1662233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = t_dst; 1663233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[1] = t_dst + 8; 1664233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1665233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = s - 4; 1666233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[1] = s - 4 + p * 8; 1667233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1668233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose back 1669233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, 16, dst, p, 2); 1670233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1671233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1672233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_16_sse2(unsigned char *s, int p, 1673233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *blimit, 1674233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *limit, 1675233d2500723e5594f3e7c70896ffeeef32b9c950ywan const unsigned char *thresh) { 1676233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16); 1677233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *src[2]; 1678233d2500723e5594f3e7c70896ffeeef32b9c950ywan unsigned char *dst[2]; 1679233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1680233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = s - 8; 1681233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[1] = s; 1682233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = t_dst; 1683233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[1] = t_dst + 8 * 8; 1684233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1685233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose 16x8 1686233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, p, dst, 8, 2); 1687233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1688233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Loop filtering 1689233d2500723e5594f3e7c70896ffeeef32b9c950ywan mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh); 1690233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1691233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[0] = t_dst; 1692233d2500723e5594f3e7c70896ffeeef32b9c950ywan src[1] = t_dst + 8 * 8; 1693233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[0] = s - 8; 1694233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst[1] = s; 1695233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1696233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose back 1697233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose(src, 8, dst, p, 2); 1698233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1699233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1700233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p, 1701233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *blimit, const uint8_t *limit, 1702233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *thresh) { 1703233d2500723e5594f3e7c70896ffeeef32b9c950ywan DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256); 1704233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1705233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose 16x16 1706233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); 1707233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); 1708233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1709233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Loop filtering 1710233d2500723e5594f3e7c70896ffeeef32b9c950ywan mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit, 1711233d2500723e5594f3e7c70896ffeeef32b9c950ywan thresh); 1712233d2500723e5594f3e7c70896ffeeef32b9c950ywan 1713233d2500723e5594f3e7c70896ffeeef32b9c950ywan // Transpose back 1714233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); 1715233d2500723e5594f3e7c70896ffeeef32b9c950ywan transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); 1716233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 1717