/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};

#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to the source for rounding */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide the source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  /* average the current source with the one size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                    (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
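
// Each 32-byte entry of bilinear_filters_avx2 repeats one (f0, f1) tap
// pair, with f0 + f1 == 16, sixteen times, so an offset selects its
// filter at byte offset (offset << 5).  Per pixel pair, FILTER_SRC then
// computes the rounded two-tap interpolation: maddubs, add 8, arithmetic
// shift right by 4.  A scalar sketch of that arithmetic (illustration
// only; the name is ours and nothing below calls it):
static unsigned int bilinear_scalar_ref(unsigned int a, unsigned int b,
                                        unsigned int f0, unsigned int f1) {
  return (a * f0 + b * f1 + 8) >> 4;  // round to nearest, divide by 16
}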

#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final horizontal reduction of sum and sse
#define CALC_SUM_AND_SSE \
  /* sign-extend the 16-bit sums to 32 bits by unpacking them with \
     the (sum < 0) mask */ \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  /* add the two 128-bit lanes of each accumulator */ \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
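
// The offsets select one of the sixteen filters above; offset 8 is the
// half-pel position, where the generic (8 * a + 8 * b + 8) >> 4 filter
// collapses to the byte average (a + b + 1) >> 1 with identical rounding,
// which is why the offset == 8 branches below use the cheaper
// _mm256_avg_epu8.  A scalar illustration (our name, unused by the
// kernels):
static unsigned int half_pel_scalar_ref(unsigned int a, unsigned int b) {
  return (a + b + 1) >> 1;  // equals (8 * a + 8 * b + 8) >> 4
}
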
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous source average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // apply the vertical filter
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
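
// The kernel above covers a 32-wide column of `height` rows, returning
// the signed sum of differences and writing the SSE through *sse.  Wider
// blocks can reuse it on 32-wide halves; a minimal sketch of a
// hypothetical 64-wide caller (variance = SSE - sum^2 / N; assumes
// int64_t is available via the headers above):
static unsigned int example_sub_pixel_variance64xh(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
  unsigned int sse1, sse2;
  const int se1 = vp9_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, height, &sse1);
  const int se2 = vp9_sub_pixel_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,
      height, &sse2);
  const int se = se1 + se2;
  *sse = sse1 + sse2;
  // 64 * height pixels total; for a 64x64 block this is (se * se) >> 12
  return *sse - (unsigned int)(((int64_t)se * se) / (64 * height));
}
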
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                                 int src_stride,
                                                 int x_offset,
                                                 int y_offset,
                                                 const uint8_t *dst,
                                                 int dst_stride,
                                                 const uint8_t *sec,
                                                 int sec_stride,
                                                 int height,
                                                 unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous source average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a copy of it offset one byte to the right
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each 128-bit lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // apply the vertical filter
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
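
// Note for callers of the avg variant: the second prediction `sec` is
// fetched with the aligned _mm256_load_si256(), so it must be 32-byte
// aligned.  A minimal caller sketch (hypothetical wrapper name; a
// 32-wide block would typically pass sec_stride == 32, and int64_t is
// assumed available via the headers above):
static unsigned int example_sub_pixel_avg_variance32xh(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int height,
    unsigned int *sse) {
  const int se = vp9_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32,
      height, sse);
  // variance = SSE - sum^2 / (32 * height)
  return *sse - (unsigned int)(((int64_t)se * se) / (32 * height));
}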