1/* 2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <immintrin.h> 12 13#include "./vpx_dsp_rtcd.h" 14#include "vpx_dsp/x86/convolve.h" 15 16// ----------------------------------------------------------------------------- 17// Copy and average 18 19void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, 20 uint16_t *dst, ptrdiff_t dst_stride, 21 const InterpKernel *filter, int x0_q4, 22 int x_step_q4, int y0_q4, int y_step_q4, 23 int width, int h, int bd) { 24 (void)filter; 25 (void)x0_q4; 26 (void)x_step_q4; 27 (void)y0_q4; 28 (void)y_step_q4; 29 (void)bd; 30 31 assert(width % 4 == 0); 32 if (width > 32) { // width = 64 33 do { 34 const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); 35 const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 36 const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); 37 const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); 38 src += src_stride; 39 _mm256_storeu_si256((__m256i *)dst, p0); 40 _mm256_storeu_si256((__m256i *)(dst + 16), p1); 41 _mm256_storeu_si256((__m256i *)(dst + 32), p2); 42 _mm256_storeu_si256((__m256i *)(dst + 48), p3); 43 dst += dst_stride; 44 h--; 45 } while (h > 0); 46 } else if (width > 16) { // width = 32 47 do { 48 const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); 49 const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 50 src += src_stride; 51 _mm256_storeu_si256((__m256i *)dst, p0); 52 _mm256_storeu_si256((__m256i *)(dst + 16), p1); 53 dst += dst_stride; 54 h--; 55 } while (h > 0); 56 } else if (width > 8) { // width = 16 57 __m256i p0, p1; 58 do { 
59 p0 = _mm256_loadu_si256((const __m256i *)src); 60 src += src_stride; 61 p1 = _mm256_loadu_si256((const __m256i *)src); 62 src += src_stride; 63 64 _mm256_storeu_si256((__m256i *)dst, p0); 65 dst += dst_stride; 66 _mm256_storeu_si256((__m256i *)dst, p1); 67 dst += dst_stride; 68 h -= 2; 69 } while (h > 0); 70 } else if (width > 4) { // width = 8 71 __m128i p0, p1; 72 do { 73 p0 = _mm_loadu_si128((const __m128i *)src); 74 src += src_stride; 75 p1 = _mm_loadu_si128((const __m128i *)src); 76 src += src_stride; 77 78 _mm_storeu_si128((__m128i *)dst, p0); 79 dst += dst_stride; 80 _mm_storeu_si128((__m128i *)dst, p1); 81 dst += dst_stride; 82 h -= 2; 83 } while (h > 0); 84 } else { // width = 4 85 __m128i p0, p1; 86 do { 87 p0 = _mm_loadl_epi64((const __m128i *)src); 88 src += src_stride; 89 p1 = _mm_loadl_epi64((const __m128i *)src); 90 src += src_stride; 91 92 _mm_storel_epi64((__m128i *)dst, p0); 93 dst += dst_stride; 94 _mm_storel_epi64((__m128i *)dst, p1); 95 dst += dst_stride; 96 h -= 2; 97 } while (h > 0); 98 } 99} 100 101void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, 102 uint16_t *dst, ptrdiff_t dst_stride, 103 const InterpKernel *filter, int x0_q4, 104 int x_step_q4, int y0_q4, int y_step_q4, 105 int width, int h, int bd) { 106 (void)filter; 107 (void)x0_q4; 108 (void)x_step_q4; 109 (void)y0_q4; 110 (void)y_step_q4; 111 (void)bd; 112 113 assert(width % 4 == 0); 114 if (width > 32) { // width = 64 115 __m256i p0, p1, p2, p3, u0, u1, u2, u3; 116 do { 117 p0 = _mm256_loadu_si256((const __m256i *)src); 118 p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 119 p2 = _mm256_loadu_si256((const __m256i *)(src + 32)); 120 p3 = _mm256_loadu_si256((const __m256i *)(src + 48)); 121 src += src_stride; 122 u0 = _mm256_loadu_si256((const __m256i *)dst); 123 u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); 124 u2 = _mm256_loadu_si256((const __m256i *)(dst + 32)); 125 u3 = _mm256_loadu_si256((const __m256i *)(dst + 48)); 126 
_mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 127 _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); 128 _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2)); 129 _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3)); 130 dst += dst_stride; 131 h--; 132 } while (h > 0); 133 } else if (width > 16) { // width = 32 134 __m256i p0, p1, u0, u1; 135 do { 136 p0 = _mm256_loadu_si256((const __m256i *)src); 137 p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); 138 src += src_stride; 139 u0 = _mm256_loadu_si256((const __m256i *)dst); 140 u1 = _mm256_loadu_si256((const __m256i *)(dst + 16)); 141 _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 142 _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1)); 143 dst += dst_stride; 144 h--; 145 } while (h > 0); 146 } else if (width > 8) { // width = 16 147 __m256i p0, p1, u0, u1; 148 do { 149 p0 = _mm256_loadu_si256((const __m256i *)src); 150 p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride)); 151 src += src_stride << 1; 152 u0 = _mm256_loadu_si256((const __m256i *)dst); 153 u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride)); 154 155 _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0)); 156 _mm256_storeu_si256((__m256i *)(dst + dst_stride), 157 _mm256_avg_epu16(p1, u1)); 158 dst += dst_stride << 1; 159 h -= 2; 160 } while (h > 0); 161 } else if (width > 4) { // width = 8 162 __m128i p0, p1, u0, u1; 163 do { 164 p0 = _mm_loadu_si128((const __m128i *)src); 165 p1 = _mm_loadu_si128((const __m128i *)(src + src_stride)); 166 src += src_stride << 1; 167 u0 = _mm_loadu_si128((const __m128i *)dst); 168 u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride)); 169 170 _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0)); 171 _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1)); 172 dst += dst_stride << 1; 173 h -= 2; 174 } while (h > 0); 175 } else { // width = 4 176 __m128i p0, p1, u0, u1; 177 
do { 178 p0 = _mm_loadl_epi64((const __m128i *)src); 179 p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride)); 180 src += src_stride << 1; 181 u0 = _mm_loadl_epi64((const __m128i *)dst); 182 u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride)); 183 184 _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0)); 185 _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1)); 186 dst += dst_stride << 1; 187 h -= 2; 188 } while (h > 0); 189 } 190} 191 192// ----------------------------------------------------------------------------- 193// Horizontal and vertical filtering 194 195#define CONV8_ROUNDING_BITS (7) 196 197static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 198 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 199 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; 200 201static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9, 202 8, 9, 10, 11, 10, 11, 12, 13, 203 4, 5, 6, 7, 6, 7, 8, 9, 204 8, 9, 10, 11, 10, 11, 12, 13 }; 205 206static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, 207 10, 11, 12, 13, 12, 13, 14, 15, 208 6, 7, 8, 9, 8, 9, 10, 11, 209 10, 11, 12, 13, 12, 13, 14, 15 }; 210 211static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; 212 213// ----------------------------------------------------------------------------- 214// Horizontal Filtering 215 216static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) { 217 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 218 const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0); 219 const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1); 220 const __m256i c = _mm256_permutevar8x32_epi32(*s, idx); 221 222 p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6 223 p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7 224 p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4 225 p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5 226} 227 228// Note: 229// Shared by 8x2 and 16x1 block 230static INLINE void 
pack_16_pixels(const __m256i *s0, const __m256i *s1, 231 __m256i *x /*x[8]*/) { 232 __m256i pp[8]; 233 pack_pixels(s0, pp); 234 pack_pixels(s1, &pp[4]); 235 x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20); 236 x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20); 237 x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20); 238 x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20); 239 x[4] = x[2]; 240 x[5] = x[3]; 241 x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31); 242 x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31); 243} 244 245static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) { 246 __m256i pp[8]; 247 __m256i s0; 248 s0 = _mm256_loadu_si256((const __m256i *)src); 249 pack_pixels(&s0, pp); 250 x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30); 251 x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30); 252 x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30); 253 x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30); 254} 255 256static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride, 257 __m256i *x) { 258 __m256i s0, s1; 259 s0 = _mm256_loadu_si256((const __m256i *)src); 260 s1 = _mm256_loadu_si256((const __m256i *)(src + stride)); 261 pack_16_pixels(&s0, &s1, x); 262} 263 264static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) { 265 __m256i s0, s1; 266 s0 = _mm256_loadu_si256((const __m256i *)src); 267 s1 = _mm256_loadu_si256((const __m256i *)(src + 8)); 268 pack_16_pixels(&s0, &s1, x); 269} 270 271// Note: 272// Shared by horizontal and vertical filtering 273static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { 274 const __m128i h = _mm_loadu_si128((const __m128i *)filter); 275 const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); 276 const __m256i p0 = _mm256_set1_epi32(0x03020100); 277 const __m256i p1 = _mm256_set1_epi32(0x07060504); 278 const __m256i p2 = _mm256_set1_epi32(0x0b0a0908); 279 const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c); 280 f[0] = 
_mm256_shuffle_epi8(hh, p0); 281 f[1] = _mm256_shuffle_epi8(hh, p1); 282 f[2] = _mm256_shuffle_epi8(hh, p2); 283 f[3] = _mm256_shuffle_epi8(hh, p3); 284} 285 286static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, 287 const __m256i *fil /*fil[4]*/, 288 __m256i *y) { 289 __m256i a, a0, a1; 290 291 a0 = _mm256_madd_epi16(fil[0], sig[0]); 292 a1 = _mm256_madd_epi16(fil[3], sig[3]); 293 a = _mm256_add_epi32(a0, a1); 294 295 a0 = _mm256_madd_epi16(fil[1], sig[1]); 296 a1 = _mm256_madd_epi16(fil[2], sig[2]); 297 298 { 299 const __m256i min = _mm256_min_epi32(a0, a1); 300 a = _mm256_add_epi32(a, min); 301 } 302 { 303 const __m256i max = _mm256_max_epi32(a0, a1); 304 a = _mm256_add_epi32(a, max); 305 } 306 { 307 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 308 a = _mm256_add_epi32(a, rounding); 309 *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS); 310 } 311} 312 313static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask, 314 uint16_t *dst) { 315 const __m128i a0 = _mm256_castsi256_si128(*y); 316 const __m128i a1 = _mm256_extractf128_si256(*y, 1); 317 __m128i res = _mm_packus_epi32(a0, a1); 318 res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); 319 _mm_storeu_si128((__m128i *)dst, res); 320} 321 322static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1, 323 const __m256i *mask, uint16_t *dst, 324 ptrdiff_t pitch) { 325 __m256i a = _mm256_packus_epi32(*y0, *y1); 326 a = _mm256_min_epi16(a, *mask); 327 _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); 328 _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); 329} 330 331static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1, 332 const __m256i *mask, uint16_t *dst) { 333 __m256i a = _mm256_packus_epi32(*y0, *y1); 334 a = _mm256_min_epi16(a, *mask); 335 _mm256_storeu_si256((__m256i *)dst, a); 336} 337 338static void vpx_highbd_filter_block1d8_h8_avx2( 339 const uint16_t *src_ptr, ptrdiff_t 
src_pitch, uint16_t *dst_ptr, 340 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 341 __m256i signal[8], res0, res1; 342 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 343 344 __m256i ff[4]; 345 pack_filters(filter, ff); 346 347 src_ptr -= 3; 348 do { 349 pack_8x2_pixels(src_ptr, src_pitch, signal); 350 filter_8x1_pixels(signal, ff, &res0); 351 filter_8x1_pixels(&signal[4], ff, &res1); 352 store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 353 height -= 2; 354 src_ptr += src_pitch << 1; 355 dst_ptr += dst_pitch << 1; 356 } while (height > 1); 357 358 if (height > 0) { 359 pack_8x1_pixels(src_ptr, signal); 360 filter_8x1_pixels(signal, ff, &res0); 361 store_8x1_pixels(&res0, &max, dst_ptr); 362 } 363} 364 365static void vpx_highbd_filter_block1d16_h8_avx2( 366 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 367 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 368 __m256i signal[8], res0, res1; 369 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 370 371 __m256i ff[4]; 372 pack_filters(filter, ff); 373 374 src_ptr -= 3; 375 do { 376 pack_16x1_pixels(src_ptr, signal); 377 filter_8x1_pixels(signal, ff, &res0); 378 filter_8x1_pixels(&signal[4], ff, &res1); 379 store_16x1_pixels(&res0, &res1, &max, dst_ptr); 380 height -= 1; 381 src_ptr += src_pitch; 382 dst_ptr += dst_pitch; 383 } while (height > 0); 384} 385 386// ----------------------------------------------------------------------------- 387// 2-tap horizontal filtering 388 389static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) { 390 const __m128i h = _mm_loadu_si128((const __m128i *)filter); 391 const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1); 392 const __m256i p = _mm256_set1_epi32(0x09080706); 393 f[0] = _mm256_shuffle_epi8(hh, p); 394} 395 396// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels() 397// the difference is s0/s1 specifies first and second rows or, 398// first 16 
samples and 8-sample shifted 16 samples 399static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1, 400 __m256i *sig) { 401 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 402 const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); 403 __m256i x0 = _mm256_shuffle_epi8(*s0, sf2); 404 __m256i x1 = _mm256_shuffle_epi8(*s1, sf2); 405 __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx); 406 __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx); 407 r0 = _mm256_shuffle_epi8(r0, sf2); 408 r1 = _mm256_shuffle_epi8(r1, sf2); 409 sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20); 410 sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20); 411} 412 413static INLINE void pack_8x2_2t_pixels(const uint16_t *src, 414 const ptrdiff_t pitch, __m256i *sig) { 415 const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 416 const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); 417 pack_16_2t_pixels(&r0, &r1, sig); 418} 419 420static INLINE void pack_16x1_2t_pixels(const uint16_t *src, 421 __m256i *sig /*sig[2]*/) { 422 const __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 423 const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8)); 424 pack_16_2t_pixels(&r0, &r1, sig); 425} 426 427static INLINE void pack_8x1_2t_pixels(const uint16_t *src, 428 __m256i *sig /*sig[2]*/) { 429 const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index); 430 const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2); 431 __m256i r0 = _mm256_loadu_si256((const __m256i *)src); 432 __m256i x0 = _mm256_shuffle_epi8(r0, sf2); 433 r0 = _mm256_permutevar8x32_epi32(r0, idx); 434 r0 = _mm256_shuffle_epi8(r0, sf2); 435 sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20); 436} 437 438// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels() 439static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f, 440 __m256i *y0, __m256i *y1) { 441 const __m256i rounding = _mm256_set1_epi32(1 
<< (CONV8_ROUNDING_BITS - 1)); 442 __m256i x0 = _mm256_madd_epi16(sig[0], *f); 443 __m256i x1 = _mm256_madd_epi16(sig[1], *f); 444 x0 = _mm256_add_epi32(x0, rounding); 445 x1 = _mm256_add_epi32(x1, rounding); 446 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); 447 *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS); 448} 449 450static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f, 451 __m256i *y0) { 452 const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 453 __m256i x0 = _mm256_madd_epi16(sig[0], *f); 454 x0 = _mm256_add_epi32(x0, rounding); 455 *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS); 456} 457 458static void vpx_highbd_filter_block1d8_h2_avx2( 459 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 460 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 461 __m256i signal[2], res0, res1; 462 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 463 464 __m256i ff; 465 pack_2t_filter(filter, &ff); 466 467 src_ptr -= 3; 468 do { 469 pack_8x2_2t_pixels(src_ptr, src_pitch, signal); 470 filter_16_2t_pixels(signal, &ff, &res0, &res1); 471 store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 472 height -= 2; 473 src_ptr += src_pitch << 1; 474 dst_ptr += dst_pitch << 1; 475 } while (height > 1); 476 477 if (height > 0) { 478 pack_8x1_2t_pixels(src_ptr, signal); 479 filter_8x1_2t_pixels(signal, &ff, &res0); 480 store_8x1_pixels(&res0, &max, dst_ptr); 481 } 482} 483 484static void vpx_highbd_filter_block1d16_h2_avx2( 485 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 486 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 487 __m256i signal[2], res0, res1; 488 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 489 490 __m256i ff; 491 pack_2t_filter(filter, &ff); 492 493 src_ptr -= 3; 494 do { 495 pack_16x1_2t_pixels(src_ptr, signal); 496 filter_16_2t_pixels(signal, &ff, &res0, &res1); 497 store_16x1_pixels(&res0, &res1, &max, dst_ptr); 498 
height -= 1; 499 src_ptr += src_pitch; 500 dst_ptr += dst_pitch; 501 } while (height > 0); 502} 503 504// ----------------------------------------------------------------------------- 505// Vertical Filtering 506 507static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { 508 __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src)); 509 __m256i s1 = 510 _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch))); 511 __m256i s2 = _mm256_castsi128_si256( 512 _mm_loadu_si128((const __m128i *)(src + 2 * pitch))); 513 __m256i s3 = _mm256_castsi128_si256( 514 _mm_loadu_si128((const __m128i *)(src + 3 * pitch))); 515 __m256i s4 = _mm256_castsi128_si256( 516 _mm_loadu_si128((const __m128i *)(src + 4 * pitch))); 517 __m256i s5 = _mm256_castsi128_si256( 518 _mm_loadu_si128((const __m128i *)(src + 5 * pitch))); 519 __m256i s6 = _mm256_castsi128_si256( 520 _mm_loadu_si128((const __m128i *)(src + 6 * pitch))); 521 522 s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); 523 s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1); 524 s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1); 525 s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1); 526 s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1); 527 s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1); 528 529 sig[0] = _mm256_unpacklo_epi16(s0, s1); 530 sig[4] = _mm256_unpackhi_epi16(s0, s1); 531 sig[1] = _mm256_unpacklo_epi16(s2, s3); 532 sig[5] = _mm256_unpackhi_epi16(s2, s3); 533 sig[2] = _mm256_unpacklo_epi16(s4, s5); 534 sig[6] = _mm256_unpackhi_epi16(s4, s5); 535 sig[8] = s6; 536} 537 538static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch, 539 __m256i *sig) { 540 // base + 7th row 541 __m256i s0 = _mm256_castsi128_si256( 542 _mm_loadu_si128((const __m128i *)(src + 7 * pitch))); 543 // base + 8th row 544 __m256i s1 = _mm256_castsi128_si256( 545 _mm_loadu_si128((const __m128i *)(src 
+ 8 * pitch))); 546 __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1); 547 __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1); 548 sig[3] = _mm256_unpacklo_epi16(s2, s3); 549 sig[7] = _mm256_unpackhi_epi16(s2, s3); 550 sig[8] = s1; 551} 552 553static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f, 554 __m256i *y0, __m256i *y1) { 555 filter_8x1_pixels(sig, f, y0); 556 filter_8x1_pixels(&sig[4], f, y1); 557} 558 559static INLINE void update_pixels(__m256i *sig) { 560 int i; 561 for (i = 0; i < 3; ++i) { 562 sig[i] = sig[i + 1]; 563 sig[i + 4] = sig[i + 5]; 564 } 565} 566 567static void vpx_highbd_filter_block1d8_v8_avx2( 568 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 569 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 570 __m256i signal[9], res0, res1; 571 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 572 573 __m256i ff[4]; 574 pack_filters(filter, ff); 575 576 pack_8x9_init(src_ptr, src_pitch, signal); 577 578 do { 579 pack_8x9_pixels(src_ptr, src_pitch, signal); 580 581 filter_8x9_pixels(signal, ff, &res0, &res1); 582 store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 583 update_pixels(signal); 584 585 src_ptr += src_pitch << 1; 586 dst_ptr += dst_pitch << 1; 587 height -= 2; 588 } while (height > 0); 589} 590 591static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) { 592 __m256i u0, u1, u2, u3; 593 // load 0-6 rows 594 const __m256i s0 = _mm256_loadu_si256((const __m256i *)src); 595 const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch)); 596 const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch)); 597 const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch)); 598 const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch)); 599 const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch)); 600 const __m256i s6 = _mm256_loadu_si256((const 
__m256i *)(src + 6 * pitch)); 601 602 u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low 603 u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high 604 605 u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low 606 u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high 607 608 sig[0] = _mm256_unpacklo_epi16(u0, u2); 609 sig[4] = _mm256_unpackhi_epi16(u0, u2); 610 611 sig[8] = _mm256_unpacklo_epi16(u1, u3); 612 sig[12] = _mm256_unpackhi_epi16(u1, u3); 613 614 u0 = _mm256_permute2x128_si256(s2, s3, 0x20); 615 u1 = _mm256_permute2x128_si256(s2, s3, 0x31); 616 617 u2 = _mm256_permute2x128_si256(s3, s4, 0x20); 618 u3 = _mm256_permute2x128_si256(s3, s4, 0x31); 619 620 sig[1] = _mm256_unpacklo_epi16(u0, u2); 621 sig[5] = _mm256_unpackhi_epi16(u0, u2); 622 623 sig[9] = _mm256_unpacklo_epi16(u1, u3); 624 sig[13] = _mm256_unpackhi_epi16(u1, u3); 625 626 u0 = _mm256_permute2x128_si256(s4, s5, 0x20); 627 u1 = _mm256_permute2x128_si256(s4, s5, 0x31); 628 629 u2 = _mm256_permute2x128_si256(s5, s6, 0x20); 630 u3 = _mm256_permute2x128_si256(s5, s6, 0x31); 631 632 sig[2] = _mm256_unpacklo_epi16(u0, u2); 633 sig[6] = _mm256_unpackhi_epi16(u0, u2); 634 635 sig[10] = _mm256_unpacklo_epi16(u1, u3); 636 sig[14] = _mm256_unpackhi_epi16(u1, u3); 637 638 sig[16] = s6; 639} 640 641static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch, 642 __m256i *sig) { 643 // base + 7th row 644 const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch)); 645 // base + 8th row 646 const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch)); 647 648 __m256i u0, u1, u2, u3; 649 u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20); 650 u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31); 651 652 u2 = _mm256_permute2x128_si256(s7, s8, 0x20); 653 u3 = _mm256_permute2x128_si256(s7, s8, 0x31); 654 655 sig[3] = _mm256_unpacklo_epi16(u0, u2); 656 sig[7] = _mm256_unpackhi_epi16(u0, u2); 657 658 sig[11] = _mm256_unpacklo_epi16(u1, u3); 659 sig[15] = 
_mm256_unpackhi_epi16(u1, u3); 660 661 sig[16] = s8; 662} 663 664static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f, 665 __m256i *y0, __m256i *y1) { 666 __m256i res[4]; 667 int i; 668 for (i = 0; i < 4; ++i) { 669 filter_8x1_pixels(&sig[i << 2], f, &res[i]); 670 } 671 672 { 673 const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]); 674 const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]); 675 *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20); 676 *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31); 677 } 678} 679 680static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1, 681 const __m256i *mask, uint16_t *dst, 682 ptrdiff_t pitch) { 683 __m256i p = _mm256_min_epi16(*y0, *mask); 684 _mm256_storeu_si256((__m256i *)dst, p); 685 p = _mm256_min_epi16(*y1, *mask); 686 _mm256_storeu_si256((__m256i *)(dst + pitch), p); 687} 688 689static void update_16x9_pixels(__m256i *sig) { 690 update_pixels(&sig[0]); 691 update_pixels(&sig[8]); 692} 693 694static void vpx_highbd_filter_block1d16_v8_avx2( 695 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 696 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 697 __m256i signal[17], res0, res1; 698 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 699 700 __m256i ff[4]; 701 pack_filters(filter, ff); 702 703 pack_16x9_init(src_ptr, src_pitch, signal); 704 705 do { 706 pack_16x9_pixels(src_ptr, src_pitch, signal); 707 filter_16x9_pixels(signal, ff, &res0, &res1); 708 store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 709 update_16x9_pixels(signal); 710 711 src_ptr += src_pitch << 1; 712 dst_ptr += dst_pitch << 1; 713 height -= 2; 714 } while (height > 0); 715} 716 717// ----------------------------------------------------------------------------- 718// 2-tap vertical filtering 719 720static void pack_16x2_init(const uint16_t *src, __m256i *sig) { 721 sig[2] = _mm256_loadu_si256((const __m256i *)src); 722} 723 724static INLINE void 
pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch, 725 __m256i *sig) { 726 // load the next row 727 const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch)); 728 sig[0] = _mm256_unpacklo_epi16(sig[2], u); 729 sig[1] = _mm256_unpackhi_epi16(sig[2], u); 730 sig[2] = u; 731} 732 733static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f, 734 __m256i *y0, __m256i *y1) { 735 filter_16_2t_pixels(sig, f, y0, y1); 736} 737 738static void vpx_highbd_filter_block1d16_v2_avx2( 739 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 740 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 741 __m256i signal[3], res0, res1; 742 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 743 __m256i ff; 744 745 pack_2t_filter(filter, &ff); 746 pack_16x2_init(src_ptr, signal); 747 748 do { 749 pack_16x2_2t_pixels(src_ptr, src_pitch, signal); 750 filter_16x2_2t_pixels(signal, &ff, &res0, &res1); 751 store_16x1_pixels(&res0, &res1, &max, dst_ptr); 752 753 src_ptr += src_pitch; 754 dst_ptr += dst_pitch; 755 height -= 1; 756 } while (height > 0); 757} 758 759static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) { 760 const __m128i h = _mm_loadu_si128((const __m128i *)filter); 761 const __m128i p = _mm_set1_epi32(0x09080706); 762 f[0] = _mm_shuffle_epi8(h, p); 763} 764 765static void pack_8x2_init(const uint16_t *src, __m128i *sig) { 766 sig[2] = _mm_loadu_si128((const __m128i *)src); 767} 768 769static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch, 770 __m128i *sig) { 771 // load the next row 772 const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch)); 773 sig[0] = _mm_unpacklo_epi16(sig[2], u); 774 sig[1] = _mm_unpackhi_epi16(sig[2], u); 775 sig[2] = u; 776} 777 778static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f, 779 __m128i *y0, __m128i *y1) { 780 const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); 781 __m128i x0 = 
_mm_madd_epi16(sig[0], *f); 782 __m128i x1 = _mm_madd_epi16(sig[1], *f); 783 x0 = _mm_add_epi32(x0, rounding); 784 x1 = _mm_add_epi32(x1, rounding); 785 *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS); 786 *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS); 787} 788 789static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1, 790 const __m128i *mask, uint16_t *dst) { 791 __m128i res = _mm_packus_epi32(*y0, *y1); 792 res = _mm_min_epi16(res, *mask); 793 _mm_storeu_si128((__m128i *)dst, res); 794} 795 796static void vpx_highbd_filter_block1d8_v2_avx2( 797 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 798 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 799 __m128i signal[3], res0, res1; 800 const __m128i max = _mm_set1_epi16((1 << bd) - 1); 801 __m128i ff; 802 803 pack_8x1_2t_filter(filter, &ff); 804 pack_8x2_init(src_ptr, signal); 805 806 do { 807 pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal); 808 filter_8_2t_pixels(signal, &ff, &res0, &res1); 809 store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr); 810 811 src_ptr += src_pitch; 812 dst_ptr += dst_pitch; 813 height -= 1; 814 } while (height > 0); 815} 816 817// Calculation with averaging the input pixels 818 819static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask, 820 uint16_t *dst) { 821 const __m128i a0 = _mm256_castsi256_si128(*y0); 822 const __m128i a1 = _mm256_extractf128_si256(*y0, 1); 823 __m128i res = _mm_packus_epi32(a0, a1); 824 const __m128i pix = _mm_loadu_si128((const __m128i *)dst); 825 res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask)); 826 res = _mm_avg_epu16(res, pix); 827 _mm_storeu_si128((__m128i *)dst, res); 828} 829 830static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1, 831 const __m256i *mask, uint16_t *dst, 832 ptrdiff_t pitch) { 833 __m256i a = _mm256_packus_epi32(*y0, *y1); 834 const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst); 835 const __m128i pix1 = 
_mm_loadu_si128((const __m128i *)(dst + pitch)); 836 const __m256i pix = 837 _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); 838 a = _mm256_min_epi16(a, *mask); 839 a = _mm256_avg_epu16(a, pix); 840 _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a)); 841 _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1)); 842} 843 844static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1, 845 const __m256i *mask, uint16_t *dst) { 846 __m256i a = _mm256_packus_epi32(*y0, *y1); 847 const __m256i pix = _mm256_loadu_si256((const __m256i *)dst); 848 a = _mm256_min_epi16(a, *mask); 849 a = _mm256_avg_epu16(a, pix); 850 _mm256_storeu_si256((__m256i *)dst, a); 851} 852 853static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1, 854 const __m256i *mask, uint16_t *dst, 855 ptrdiff_t pitch) { 856 const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst); 857 const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch)); 858 __m256i p = _mm256_min_epi16(*y0, *mask); 859 p = _mm256_avg_epu16(p, pix0); 860 _mm256_storeu_si256((__m256i *)dst, p); 861 862 p = _mm256_min_epi16(*y1, *mask); 863 p = _mm256_avg_epu16(p, pix1); 864 _mm256_storeu_si256((__m256i *)(dst + pitch), p); 865} 866 867static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0, 868 const __m128i *y1, 869 const __m128i *mask, 870 uint16_t *dst) { 871 __m128i res = _mm_packus_epi32(*y0, *y1); 872 const __m128i pix = _mm_loadu_si128((const __m128i *)dst); 873 res = _mm_min_epi16(res, *mask); 874 res = _mm_avg_epu16(res, pix); 875 _mm_storeu_si128((__m128i *)dst, res); 876} 877 878static void vpx_highbd_filter_block1d8_h8_avg_avx2( 879 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 880 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 881 __m256i signal[8], res0, res1; 882 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 883 884 __m256i ff[4]; 885 pack_filters(filter, ff); 886 
887 src_ptr -= 3; 888 do { 889 pack_8x2_pixels(src_ptr, src_pitch, signal); 890 filter_8x1_pixels(signal, ff, &res0); 891 filter_8x1_pixels(&signal[4], ff, &res1); 892 store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 893 height -= 2; 894 src_ptr += src_pitch << 1; 895 dst_ptr += dst_pitch << 1; 896 } while (height > 1); 897 898 if (height > 0) { 899 pack_8x1_pixels(src_ptr, signal); 900 filter_8x1_pixels(signal, ff, &res0); 901 store_8x1_avg_pixels(&res0, &max, dst_ptr); 902 } 903} 904 905static void vpx_highbd_filter_block1d16_h8_avg_avx2( 906 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 907 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 908 __m256i signal[8], res0, res1; 909 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 910 911 __m256i ff[4]; 912 pack_filters(filter, ff); 913 914 src_ptr -= 3; 915 do { 916 pack_16x1_pixels(src_ptr, signal); 917 filter_8x1_pixels(signal, ff, &res0); 918 filter_8x1_pixels(&signal[4], ff, &res1); 919 store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr); 920 height -= 1; 921 src_ptr += src_pitch; 922 dst_ptr += dst_pitch; 923 } while (height > 0); 924} 925 926static void vpx_highbd_filter_block1d8_v8_avg_avx2( 927 const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, 928 ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { 929 __m256i signal[9], res0, res1; 930 const __m256i max = _mm256_set1_epi16((1 << bd) - 1); 931 932 __m256i ff[4]; 933 pack_filters(filter, ff); 934 935 pack_8x9_init(src_ptr, src_pitch, signal); 936 937 do { 938 pack_8x9_pixels(src_ptr, src_pitch, signal); 939 940 filter_8x9_pixels(signal, ff, &res0, &res1); 941 store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch); 942 update_pixels(signal); 943 944 src_ptr += src_pitch << 1; 945 dst_ptr += dst_pitch << 1; 946 height -= 2; 947 } while (height > 0); 948} 949 950static void vpx_highbd_filter_block1d16_v8_avg_avx2( 951 const uint16_t *src_ptr, ptrdiff_t src_pitch, 
                                                    uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[17], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp value

  __m256i ff[4];
  pack_filters(filter, ff);

  pack_16x9_init(src_ptr, src_pitch, signal);  // prime the row window

  do {
    pack_16x9_pixels(src_ptr, src_pitch, signal);
    filter_16x9_pixels(signal, ff, &res0, &res1);
    store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    update_16x9_pixels(signal);  // slide the window for the next two rows

    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
    height -= 2;  // two rows per iteration
  } while (height > 0);
}

// 8-wide horizontal 2-tap (bilinear) high-bitdepth filter with averaging
// output. The 2-tap kernel is carried inside an 8-tap filter array, hence
// the same src_ptr -= 3 alignment as the 8-tap paths.
static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp value

  __m256i ff;
  pack_2t_filter(filter, &ff);  // pack the two active taps

  src_ptr -= 3;
  do {
    // Two rows per iteration.
    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    height -= 2;
    src_ptr += src_pitch << 1;
    dst_ptr += dst_pitch << 1;
  } while (height > 1);

  // Odd height: process the single remaining row.
  if (height > 0) {
    pack_8x1_2t_pixels(src_ptr, signal);
    filter_8x1_2t_pixels(signal, &ff, &res0);
    store_8x1_avg_pixels(&res0, &max, dst_ptr);
  }
}

// 16-wide horizontal 2-tap (bilinear) high-bitdepth filter with averaging
// output. One row per iteration.
static void vpx_highbd_filter_block1d16_h2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[2], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp value

  __m256i ff;
  pack_2t_filter(filter, &ff);

  src_ptr -= 3;
  do {
    pack_16x1_2t_pixels(src_ptr, signal);
    filter_16_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    height -= 1;
    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
  } while (height > 0);
}

// 16-wide vertical 2-tap (bilinear) high-bitdepth filter with averaging
// output. Keeps a small row window in signal[]; pack_16x2_init preloads the
// first row, then each iteration filters one output row.
static void vpx_highbd_filter_block1d16_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m256i signal[3], res0, res1;
  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);  // pixel clamp value
  __m256i ff;

  pack_2t_filter(filter, &ff);
  pack_16x2_init(src_ptr, signal);  // prime the row window

  do {
    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;  // one row per iteration
  } while (height > 0);
}

// 8-wide vertical 2-tap (bilinear) high-bitdepth filter with averaging
// output. Uses 128-bit (SSE-width) registers since only 8 pixels per row.
static void vpx_highbd_filter_block1d8_v2_avg_avx2(
    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
  __m128i signal[3], res0, res1;
  const __m128i max = _mm_set1_epi16((1 << bd) - 1);  // pixel clamp value
  __m128i ff;

  pack_8x1_2t_filter(filter, &ff);
  pack_8x2_init(src_ptr, signal);  // prime the row window

  do {
    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    filter_8_2t_pixels(signal, &ff, &res0, &res1);
    store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);

    src_ptr += src_pitch;
    dst_ptr += dst_pitch;
    height -= 1;  // one row per iteration
  } while (height > 0);
}

// 4-wide blocks are not implemented in AVX2; forward-declare the SSE2
// versions so they can be aliased below via #define.
void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
                                        ptrdiff_t, uint32_t, const int16_t *,
                                        int);
// Alias the AVX2 names expected by the HIGH_FUN_CONV_* macros to the SSE2
// implementations for the 4-wide cases.
#define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
#define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
#define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
#define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

// Instantiate the public vpx_highbd_convolve8_{horiz,vert,2d}_avx2 entry
// points from the block1d kernels (macros from vpx_dsp/x86/convolve.h).
HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
HIGH_FUN_CONV_2D(, avx2);

// Averaging variants: same 4-wide fallback to SSE2, then the avg entry
// points are instantiated from the *_avg_avx2 kernels defined above.
void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
                                            uint16_t *, ptrdiff_t, uint32_t,
                                            const int16_t *, int);
#define vpx_highbd_filter_block1d4_h8_avg_avx2 \
  vpx_highbd_filter_block1d4_h8_avg_sse2
#define vpx_highbd_filter_block1d4_h2_avg_avx2 \
  vpx_highbd_filter_block1d4_h2_avg_sse2
#define vpx_highbd_filter_block1d4_v8_avg_avx2 \
  vpx_highbd_filter_block1d4_v8_avg_sse2
#define vpx_highbd_filter_block1d4_v2_avg_avx2 \
  vpx_highbd_filter_block1d4_v2_avg_sse2

HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
                 avx2);
HIGH_FUN_CONV_2D(avg_, avx2);

// NOTE(review): HIGHBD_FUNC is not defined in the visible portion of this
// file — presumably defined earlier; confirm. (#undef of an undefined macro
// is harmless.)
#undef HIGHBD_FUNC