/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
  __m128i retval;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i t80, max, min;

  if (bd == 8) {
    t80 = _mm_set1_epi16(0x80);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
  } else if (bd == 10) {
    t80 = _mm_set1_epi16(0x200);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
  } else {  // bd == 12
    t80 = _mm_set1_epi16(0x800);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
  }

  min = _mm_subs_epi16(zero, t80);

  ubounded = _mm_cmpgt_epi16(value, max);
  lbounded = _mm_cmplt_epi16(value, min);
  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
  ubounded = _mm_and_si128(ubounded, max);
  lbounded = _mm_and_si128(lbounded, min);
  retval = _mm_or_si128(retval, ubounded);
  retval = _mm_or_si128(retval, lbounded);
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit, limit, thresh;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
  }
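
  // Note: blimit/limit/thresh are supplied as 8-bit values; the shifts above
  // scale them by (bd - 8) so they apply to 10- and 12-bit pixels. Each row
  // below holds eight 16-bit pixels; pN is the Nth row above the horizontal
  // edge and qN the Nth row below it.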

  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

  // highbd_filter_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  // highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
  ps1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);

  // end highbd_filter4
  // loopfilter done
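
  // "flat" below selects the rows where p3..p0 and q0..q3 all lie within
  // 1 << (bd - 8) of p0/q0; those rows take the stronger 7-tap (filter8)
  // output instead of the 4-tap result computed above.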

  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7,
  // but referred to as p0-p4 & q0-q4 in the C function)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
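  // The 15-tap and 7-tap outputs below are built as sliding-window sums:
  // pixelFilter_p/q carry the wide window (with the rounding term 8 folded
  // in) and pixetFilter_p2p1p0/q2q1q0 the narrow window (with 4 folded in).
  // Each step subtracts the tap that leaves the window and adds the taps
  // that enter it, then shifts by 4 (/16) for the wide filter or 3 (/8) for
  // filter8.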
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);

  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  // wide flat
  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
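
  // All of the merges below use the same SSE2 select idiom:
  //   result = (old & ~mask) | (new & mask)
  // i.e. _mm_andnot_si128 keeps the previous value where the mask is 0, and
  // _mm_and_si128/_mm_or_si128 substitute the filtered value where it is ~0.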

  // highbd_filter8
  p2 = _mm_andnot_si128(flat, p2);
  // p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  // when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  // p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  // when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  // p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  // when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  p6 = _mm_andnot_si128(flat2, p6);
  // p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  // get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  // q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  // get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * p), p6);
  _mm_store_si128((__m128i *)(s + 6 * p), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  // p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  // get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  // full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  // q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  // get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  // full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * p), p5);
  _mm_store_si128((__m128i *)(s + 5 * p), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  // p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  // get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  // q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  // get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * p), p4);
  _mm_store_si128((__m128i *)(s + 4 * p), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  // p3 remains unchanged if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  // get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  // q3 remains unchanged if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  // get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * p), p3);
  _mm_store_si128((__m128i *)(s + 3 * p), q3);
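
  // Note: p2/q2, p1/q1 and p0/q0 were already merged with the filter8 output
  // under `flat` above; here they are merged a second time with the 15-tap
  // output under `flat2` before being stored.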

  p2 = _mm_andnot_si128(flat2, p2);
  // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  // get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  // full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  // get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  // get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  // get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  // get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  // get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s - 0 * p), q0);
}

void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit,
                                            const uint8_t *_limit,
                                            const uint8_t *_thresh, int bd) {
  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
}
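
// vpx_highbd_lpf_horizontal_8_sse2 runs the 4-tap filter on every row
// selected by the filter mask and replaces p2..q2 with 7-tap smoothed values
// in rows that are also flat; the 7-tap candidates are staged in the aligned
// stack arrays declared below before the final masked merge.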
void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask
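
  // The six 7-tap outputs below reuse a sliding 8-value window: workp_a and
  // workp_b together hold the current sum (with `four` folded in so the
  // final >> 3 implements ROUND_POWER_OF_TWO), and each step swaps the
  // oldest tap for the newest one.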
  // Added before shift for rounding part of ROUND_POWER_OF_TWO

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);
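
  // Merge phase: each output row picks the 7-tap result (staged in the
  // flat_o* arrays) where `flat` is set and the 4-tap/unfiltered value
  // elsewhere, using the same andnot/and/or select idiom as the wide filter.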

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s + 0 * p), q0);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);
}

void vpx_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;
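
  // t80/tff80/tffe0 and the t1f/t7f masks set below are the 8-bit filter
  // constants (0x80, 0xff80, 0xffe0, 0x1f, 0x7f) rescaled to the current
  // bit depth, matching the comments on t1f/t7f above.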
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
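
  // The shifts below emulate an arithmetic right shift: shift logically with
  // _mm_srli_epi16, clamp the magnitude with t1f/t7f, then re-insert the
  // sign bits (tffe0/tff80) for the lanes that _mm_cmpgt_epi16 found to be
  // negative.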
  // Filter1 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are < 0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd), t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80);

  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}

void vpx_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}
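
// highbd_transpose transposes one or more 8x8 blocks of 16-bit pixels with
// the standard SSE2 unpack ladder: 16-bit interleaves, then 32-bit, then
// 64-bit, yielding one output row per store. The vertical filters below use
// it to turn columns into rows so the horizontal kernels can be reused.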
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    p0 = _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 = _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 = _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 = _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 = _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 = _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 = _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 = _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
                                        uint16_t *out, int out_p) {
  uint16_t *src0[1];
  uint16_t *src1[1];
  uint16_t *dest0[1];
  uint16_t *dest1[1];
  src0[0] = in0;
  src1[0] = in1;
  dest0[0] = out;
  dest1[0] = out + 8;
  highbd_transpose(src0, in_p, dest0, out_p, 1);
  highbd_transpose(src1, in_p, dest1, out_p, 1);
}
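
// Each vertical variant filters a vertical edge by transposing the relevant
// columns into a stack buffer, running the matching horizontal kernel on
// that buffer, and transposing the filtered rows back into place.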
void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
                                    bd);
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

  // Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, bd);

  // Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}
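
// Note: callers normally reach these kernels through the run-time CPU
// dispatch declared in vpx_dsp_rtcd.h (e.g. the vpx_highbd_lpf_vertical_16
// entry point), rather than invoking the _sse2 symbols directly.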