/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};

void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i, src_2strides, ref_2strides;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process two strides per 256-bit register, halving the number of loop
  // iterations compared to the SSE2 code
  src_2strides = source_stride << 1;
  ref_2strides = recon_stride << 1;
  for (i = 0; i < 8; i++) {
    src = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(src,
        _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(ref,
        _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd of the low (src - ref) halves
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd of the high (src - ref) halves
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src = _mm256_add_epi32(madd_ref_src,
                                    _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

  {
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // extract the low lane and add it to the high lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // pad each 16-bit sum with another 2 zeroed bytes
    expand_sum_low = _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg),
                                        sum_res);
    expand_sum_high = _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg),
                                         sum_res);

    // arithmetic shift right by 16 to restore the sign
    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm_unpacklo_epi32(madd_res,
                                         _mm256_castsi256_si128(zero_reg));
    expand_madd_high = _mm_unpackhi_epi32(madd_res,
                                          _mm256_castsi256_si128(zero_reg));

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm_unpacklo_epi32(expand_sum,
                                           _mm256_castsi256_si128(zero_reg));
    ex_expand_sum_high = _mm_unpackhi_epi32(expand_sum,
                                            _mm256_castsi256_si128(zero_reg));

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);

    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
  }
}

void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
                          unsigned int *SSE,
                          int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process 32 elements in parallel
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd of the low (src - ref) halves
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd of the high (src - ref) halves
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src = _mm256_add_epi32(madd_ref_src,
                                    _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // pad each 16-bit sum with another 2 zeroed bytes
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    // arithmetic shift right by 16 to restore the sign
    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // extract the low lane and the high lane and add the results
    *((int *)SSE) = _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
}

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation of sum and sse
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current source averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed source with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed source with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current source averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack each 16-bit value back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed source with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source starting one byte later
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack each 16-bit value back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed source with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
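
// The helpers above return only the raw sum of differences and the sum of
// squared differences; the variance itself is computed by thin wrappers
// elsewhere in vpx_dsp. The function below is an illustrative sketch of that
// final step for a 32x32 block (1024 pixels), not part of this file: the name
// example_variance32x32 is hypothetical, and it assumes int64_t is available
// (e.g. via <stdint.h> or "vpx/vpx_integer.h").
#if 0
static unsigned int example_variance32x32(const unsigned char *src,
                                          int src_stride,
                                          const unsigned char *ref,
                                          int ref_stride,
                                          unsigned int *sse) {
  int sum;
  // Accumulate sum(src - ref) and sum((src - ref)^2) over the 32x32 block.
  vpx_get32x32var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  // variance = sse - sum^2 / N with N = 1024; the 64-bit cast avoids
  // overflowing sum * sum (|sum| can reach 1024 * 255).
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
#endif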