/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

// |a - b| for unsigned bytes: at most one of the two saturating
// subtractions is non-zero, so OR-ing them yields the absolute difference.
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

// filter_mask and hev_mask
#define FILTER_HEV_MASK \
  do { \
    /* abs(q1 - q0), abs(p1 - p0) */ \
    __m128i flat = abs_diff(q1p1, q0p0); \
    /* abs(p1 - q1), abs(p0 - q0) */ \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
    __m128i abs_p0q0, abs_p1q1, work; \
    \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
    hev = \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh); \
    hev = _mm_packs_epi16(hev, hev); \
    \
    /* const int8_t mask = filter_mask(*limit, *blimit, */ \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */ \
    abs_p0q0 = \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
    abs_p1q1 = \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
    /* abs(p3 - p2), abs(p2 - p1) */ \
    work = abs_diff(p3p2, p2p1); \
    flat = _mm_max_epu8(work, flat); \
    /* abs(q3 - q2), abs(q2 - q1) */ \
    work = abs_diff(q3q2, q2q1); \
    flat = _mm_max_epu8(work, flat); \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
    mask = _mm_unpacklo_epi64(mask, flat); \
    mask = _mm_subs_epu8(mask, limit); \
    mask = _mm_cmpeq_epi8(mask, zero); \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
  } while (0)
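
// For reference, FILTER4 below vectorizes the scalar 4-tap loop filter
// (two rows per register). A sketch of the scalar logic, in the signed
// domain (sample ^ 0x80) and using the masks built by FILTER_HEV_MASK:
//
//   filter = signed_char_clamp(ps1 - qs1) & hev;
//   filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
//   filter1 = signed_char_clamp(filter + 4) >> 3;
//   filter2 = signed_char_clamp(filter + 3) >> 3;
//   oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
//   op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
//   filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
//   oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
//   op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
//
// SSE2 has no per-byte arithmetic shift, so ">> 3" is performed by
// duplicating each byte into the high half of a 16-bit lane (unpack with
// itself) and shifting the lane right by 11; the "+ 1" of the rounding
// shift is done by subtracting ff (all ones, i.e. -1) with _mm_subs_epi8.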
#define FILTER4 \
  do { \
    const __m128i t3t4 = \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \
    const __m128i t80 = _mm_set1_epi8(0x80); \
    __m128i filter, filter2filter1, work; \
    \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \
    qs1qs0 = _mm_xor_si128(q1q0, t80); \
    \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */ \
    work = _mm_subs_epi8(ps1ps0, qs1qs0); \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev); \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */ \
    filter = _mm_subs_epi8(filter, work); \
    filter = _mm_subs_epi8(filter, work); \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */ \
    filter = _mm_and_si128(filter, mask); /* & mask */ \
    filter = _mm_unpacklo_epi64(filter, filter); \
    \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */ \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */ \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */ \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1); \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1); \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */ \
    filter = _mm_srai_epi16(filter, 11); /* >> 3 */ \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter); \
    \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */ \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */ \
    filter = _mm_unpacklo_epi8(filter, filter); \
    filter = _mm_srai_epi16(filter, 9); /* round */ \
    filter = _mm_packs_epi16(filter, filter); \
    filter = _mm_andnot_si128(hev, filter); \
    \
    hev = _mm_unpackhi_epi64(filter2filter1, filter); \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter); \
    \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1); \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev); \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */ \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \
  } while (0)

void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}
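
// The vertical 4-tap filter operates on columns: it gathers an 8x8 block
// straddling the edge, transposes it with a cascade of byte/word/dword
// unpacks (the standard SSE2 transpose idiom), runs the same
// FILTER_HEV_MASK / FILTER4 pipeline as the horizontal case, transposes
// the four modified columns back, and scatters them with 32-bit stores.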
void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
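
// The 16-wide horizontal filter folds each mirrored row pair into one
// register: qNpN holds row pN (from s - (N + 1) * p) in its low 8 bytes and
// row qN (from s + N * p) in its high 8 bytes, assembled with
// _mm_loadl_epi64 + _mm_loadh_pi. One 128-bit operation then filters both
// sides of the edge at once; _mm_shuffle_epi32(x, 78) (78 == 0x4e) swaps
// the two 64-bit halves to produce the mirrored pNqN ordering where needed.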
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done
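
    // The remainder selects between three strengths per pixel: the 4-tap
    // result above, the filter8-style result (where flat is set), and the
    // filter16-style result (where flat2 is set). flat requires p1..p3 to
    // lie within 1 of p0 and q1..q3 within 1 of q0; flat2 additionally
    // requires p4..p7 and q4..q7 to do so. Both masks are ANDed with the
    // basic filter mask, mirroring the scalar flat_mask4() / flat_mask5()
    // checks.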
    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixelFilter_p2p1p0, pixelFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixelFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixelFilter_p2p1p0);

      pixelFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixelFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixelFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixelFilter_p2p1p0, pixelFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);
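
      // Subsequent rows slide the windows instead of re-summing: one sample
      // leaves via _mm_sub_epi16 on pixelFilter_p/_q (or
      // pixelFilter_p2p1p0/_q2q1q0), sum_p7/sum_q7 (or sum_p3/sum_q3)
      // accumulate another copy of the outermost tap, and the next inner
      // tap is added, so each further >> 4 or >> 3 output costs only a few
      // 16-bit adds.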
      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_p2p1p0, p2_16);
      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixelFilter_p2p1p0 = _mm_sub_epi16(pixelFilter_p2p1p0, q1_16);
      pixelFilter_q2q1q0 = _mm_sub_epi16(pixelFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
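
    // 68 == 0x44, i.e. _mm_shuffle_epi32(x, 68) replicates the low 64 bits
    // into both halves. The flat/flat2 byte masks were reduced into the low
    // half above, so this broadcast lets a single and/andnot/or blend gate
    // the p half and the q half of each qNpN register at once.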
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

// Slide a running 16-bit filter sum: add taps *a1 and *a2, drop taps *s1
// and *s2.
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

// Complete an 8-tap sum (>> 3) and blend: where *flat is set take the
// 8-tap result, elsewhere keep *other_filt.
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 =
      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

// Complete a 16-tap sum (>> 4) and blend under the (flat2) mask, as above.
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f =
      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}
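
// The _dual variant filters a 16-pixel-wide span (two adjacent 8-pixel
// edges sharing one set of thresholds) in a single pass. Rows stay as full
// 16-byte registers rather than the folded qNpN layout, and the 16-bit
// filter arithmetic is simply done twice, on the unpacked low and high
// halves (the *_lo / *_hi pairs below).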
void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }
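
  // SSE2 lacks per-byte shifts, so the filter4 block below emulates the
  // signed ">> 3" on packed bytes: shift the 16-bit lanes logically, keep
  // only the in-byte bits (t1f == 0x1f), and OR back sign bits (te0 ==
  // 0xe0) captured beforehand with a bytewise compare against zero. The
  // rounding ">> 1" of the outer-tap adjustment uses t7f/t80 the same way.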
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
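
      // f8_lo/f8_hi now hold 4 + 3 * p3 + 2 * p2 + p1 + p0 + q0, i.e. op2 =
      // ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3) where flat is
      // set. Each step below slides the 8-sample window with
      // filter_add2_sub2: add the two samples entering, drop the two
      // leaving.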
      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;
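
      // Seed the 16-sample window for the outermost output: f = 8 + 7 * p7
      // + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0, so p6 below becomes
      // ROUND_POWER_OF_TWO(7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 +
      // q0, 4) wherever flat2 is set.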
      f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
      f_lo =
          _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
                           _mm_add_epi16(p2_lo, p1_lo));
      f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
      f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);

      f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
      f_hi =
          _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
                           _mm_add_epi16(p2_hi, p1_hi));
      f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
      f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);

      p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

      f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
      p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

      f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
      p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

      f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
      p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

      f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
      op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 3 * p), op2);

      f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
      op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 2 * p), op1);

      f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
      op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 1 * p), op0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
      oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
      oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
      oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
      q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
      q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
      q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

      f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
      f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
      q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
      _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  }
}
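
// vpx_lpf_horizontal_8_sse2 filters one 8-pixel edge. The 8-tap results are
// computed in 16-bit lanes and parked in small aligned scratch rows
// (flat_op2..flat_oq2); after the byte-domain 4-tap filter runs, the flat
// mask blends the scratch rows with the 4-tap results before storing.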
void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    // filter_mask and hev_mask
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4

    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    flat = _mm_max_epu8(abs_p1p0, flat);
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }

  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
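
      // workp_a / workp_b accumulate the same running sums as the scalar
      // filter8: the first pair gives op2 = ROUND_POWER_OF_TWO(3 * p3 +
      // 2 * p2 + p1 + p0 + q0, 3), and each later output updates the
      // accumulators with one subtract and one add apiece.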
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    filt = _mm_andnot_si128(hev, filt);
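
    // Blend per output row: work_a carries the 4-tap (byte-domain) result,
    // the flat_* scratch row carries the 8-tap result, and flat selects
    // between them. p2/q2 have no 4-tap counterpart, so the unfiltered row
    // is reloaded as the fallback.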
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
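
  // Edge 0's thresholds occupy the low 8 bytes of blimit/limit/thresh and
  // edge 1's the high 8 bytes, lining up with the byte lanes of the
  // full-width row loads above; one pass of the masks and filters therefore
  // serves both 8-pixel edges.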
  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    // filter_mask and hev_mask
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);
    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}

void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, hev, flat;

  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3: emulated per-byte arithmetic shift (logical shift, keep
    // the low 5 bits, OR the 0xe0 sign-extension bits back into negative
    // lanes), as in the single-edge version above
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt = ROUND_POWER_OF_TWO(filter1, 1), sign-corrected via t80/t7f
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}

static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
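  // The trailing numbers on the statements below appear to record the
  // intended issue order: loads from the two 8-row halves alternate as
  // odd/even, with the unpacks hoisted between them to hide load latency.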
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
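    // Three butterfly stages of successively wider unpacks (8-, 16-, then
    // 32-bit) complete the 8x8 transpose; each stage doubles the run of
    // consecutive source-column bytes, as the position comments below trace.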
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
                             const unsigned char *blimit,
                             const unsigned char *limit,
                             const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
  unsigned char *src[1];
  unsigned char *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  transpose(src, 8, dst, p, 1);
}

void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
  unsigned char *src[2];
  unsigned char *dst[2];

  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
  src[0] = t_dst;
  src[1] = t_dst + 8;
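  // src[] points at the left and right 8x8 halves of the 16-wide transposed
  // buffer; the transpose back writes them to the upper and lower 8-row
  // strips of the original vertical edge.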

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
}

void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
                              const unsigned char *blimit,
                              const unsigned char *limit,
                              const unsigned char *thresh) {
  DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
  unsigned char *src[2];
  unsigned char *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  transpose(src, 8, dst, p, 2);
}

void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[256]);

  // Transpose 16x16
  transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);

  // Transpose back
  transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}
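
// Usage sketch (illustrative only, not part of the library): the vertical
// entry points above share one pattern, transposing the edge into rows,
// running the horizontal filter, then transposing back. The identifiers
// frame, stride, x, and y and the threshold values below are hypothetical;
// a real caller derives the thresholds from the codec's loop-filter level
// tables.
//
//   DECLARE_ALIGNED(16, uint8_t, blimit[16]);
//   DECLARE_ALIGNED(16, uint8_t, limit[16]);
//   DECLARE_ALIGNED(16, uint8_t, thresh[16]);
//   memset(blimit, 60, sizeof(blimit));
//   memset(limit, 10, sizeof(limit));
//   memset(thresh, 2, sizeof(thresh));
//   // Filter 8 rows of the vertical edge immediately left of (x, y).
//   vpx_lpf_vertical_8_sse2(frame + y * stride + x, stride, blimit, limit,
//                           thresh);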