highbd_variance_sse2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"

#include "vpx_ports/mem.h"

typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int w, int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}

static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
}
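/* Editorial note (not in the original file): the 10- and 12-bit helpers
 * above rescale their results into the 8-bit range. A 10-bit sample spans
 * 4x the 8-bit range, so the sum shrinks by 4 (>> 2) and the squared error
 * by 16 (>> 4); at 12 bits the factors are 16 (>> 4) and 256 (>> 8).
 * ROUND_POWER_OF_TWO() comes from the libvpx headers and is understood to
 * be the usual round-to-nearest shift:
 *
 *   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 *
 * The 64-bit sse_long accumulators are needed because a 12-bit 64x64 block
 * can accumulate up to 64 * 64 * 4095^2 ~= 6.9e10 of squared error, well
 * beyond the 2^32 range of uint32_t.
 */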
#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
                         block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
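/* Editorial note (not in the original file): VAR_FN applies the usual
 * variance identity var = sse - sum^2 / (w * h), with the division done as
 * a right shift by shift = log2(w * h); e.g. VAR_FN(64, 64, 16, 12) passes
 * shift = 12 because 64 * 64 = 2^12. The (int64_t) cast matters: for a
 * 64x64 8-bit block, sum can reach 4096 * 255, and sum * sum would then
 * overflow a 32-bit int.
 */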
unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

#if CONFIG_USE_X86INC
#define DECL(w, opt) \
  int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                 ptrdiff_t src_stride, \
                                                 int x_offset, int y_offset, \
                                                 const uint16_t *dst, \
                                                 ptrdiff_t dst_stride, \
                                                 int height, unsigned int *sse);
#define DECLS(opt1, opt2) \
  DECL(8, opt1); \
  DECL(16, opt1)

DECLS(sse2, sse);
// TODO(johannkoenig): enable the ssse3 or delete
// DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL
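/* Editorial note (not in the original file): the assembly kernels declared
 * above handle a fixed width wf (8 or 16 pixels) and a variable height. The
 * FN wrappers below build the wider blocks out of adjacent 16-pixel columns
 * (offsets +16, +32, +48 into src and dst) and sum the partial se/sse
 * results before forming the variance.
 */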
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
                                                          int src_stride, \
                                                          int x_offset, \
                                                          int y_offset, \
                                                          const uint8_t *dst8, \
                                                          int dst_stride, \
                                                          uint32_t *sse_ptr) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst, dst_stride, h, \
                                                       &sse); \
  if (w > wf) { \
    unsigned int sse2; \
    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
                                                          src_stride, \
                                                          x_offset, y_offset, \
                                                          dst + 16, \
                                                          dst_stride, \
                                                          h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 32, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src + 48, src_stride, x_offset, y_offset, \
          dst + 48, dst_stride, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                       x_offset, y_offset, \
                                                       dst, dst_stride, \
                                                       h, &sse); \
  if (w > wf) { \
    uint32_t sse2; \
    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
                                                          src_stride, \
                                                          x_offset, y_offset, \
                                                          dst + 16, \
                                                          dst_stride, \
                                                          h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 32, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst + 48, dst_stride, \
                                                        h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
  int start_row; \
  uint32_t sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  for (start_row = 0; start_row < h; start_row += 16) { \
    uint32_t sse2; \
    int height = h - start_row < 16 ? h - start_row : 16; \
    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
        src + (start_row * src_stride), src_stride, \
        x_offset, y_offset, dst + (start_row * dst_stride), \
        dst_stride, height, &sse2); \
    se += se2; \
    long_sse += sse2; \
    if (w > wf) { \
      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
          src + 16 + (start_row * src_stride), src_stride, \
          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
          dst_stride, height, &sse2); \
      se += se2; \
      long_sse += sse2; \
      if (w > wf * 2) { \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src + 32 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
            dst_stride, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src + 48 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
            dst_stride, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
      } \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}

#define FNS(opt1, opt2) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2, sse);

#undef FNS
#undef FN

#define DECL(w, opt) \
int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                   ptrdiff_t src_stride, \
                                                   int x_offset, int y_offset, \
                                                   const uint16_t *dst, \
                                                   ptrdiff_t dst_stride, \
                                                   const uint16_t *sec, \
                                                   ptrdiff_t sec_stride, \
                                                   int height, \
                                                   unsigned int *sse);
#define DECLS(opt1) \
DECL(16, opt1) \
DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
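/* Editorial note (not in the original file): the 12-bit wrappers (above, and
 * the avg variants below) additionally walk the block in strips of at most
 * 16 rows, accumulating sse in a uint64_t. At 12 bits a single squared
 * sample error can reach 4095^2, so one 16x16 kernel call can return up to
 * 256 * 4095^2 = 4,292,870,400 -- just under 2^32. Asking a kernel for more
 * than a 16x16 tile per call could overflow its 32-bit sse output.
 */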
#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, \
      y_offset, dst, dst_stride, sec, w, h, &sse); \
  if (w > wf) { \
    uint32_t sse2; \
    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + 16, src_stride, x_offset, y_offset, \
        dst + 16, dst_stride, sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 32, src_stride, x_offset, y_offset, \
          dst + 32, dst_stride, sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 48, src_stride, x_offset, y_offset, \
          dst + 48, dst_stride, sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  uint32_t sse; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
      src, src_stride, x_offset, \
      y_offset, dst, dst_stride, \
      sec, w, h, &sse); \
  if (w > wf) { \
    uint32_t sse2; \
    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + 16, src_stride, \
        x_offset, y_offset, \
        dst + 16, dst_stride, \
        sec + 16, w, h, &sse2); \
    se += se2; \
    sse += sse2; \
    if (w > wf * 2) { \
      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 32, src_stride, \
          x_offset, y_offset, \
          dst + 32, dst_stride, \
          sec + 32, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 48, src_stride, \
          x_offset, y_offset, \
          dst + 48, dst_stride, \
          sec + 48, w, h, &sse2); \
      se += se2; \
      sse += sse2; \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 2); \
  sse = ROUND_POWER_OF_TWO(sse, 4); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
    const uint8_t *sec8) { \
  int start_row; \
  uint32_t sse; \
  int se = 0; \
  uint64_t long_sse = 0; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
  for (start_row = 0; start_row < h; start_row += 16) { \
    uint32_t sse2; \
    int height = h - start_row < 16 ? h - start_row : 16; \
    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
        src + (start_row * src_stride), src_stride, x_offset, \
        y_offset, dst + (start_row * dst_stride), dst_stride, \
        sec + (start_row * w), w, height, &sse2); \
    se += se2; \
    long_sse += sse2; \
    if (w > wf) { \
      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
          src + 16 + (start_row * src_stride), src_stride, \
          x_offset, y_offset, \
          dst + 16 + (start_row * dst_stride), dst_stride, \
          sec + 16 + (start_row * w), w, height, &sse2); \
      se += se2; \
      long_sse += sse2; \
      if (w > wf * 2) { \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 32 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, \
            dst + 32 + (start_row * dst_stride), dst_stride, \
            sec + 32 + (start_row * w), w, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
            src + 48 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, \
            dst + 48 + (start_row * dst_stride), dst_stride, \
            sec + 48 + (start_row * w), w, height, &sse2); \
        se += se2; \
        long_sse += sse2; \
      } \
    } \
  } \
  se = ROUND_POWER_OF_TWO(se, 4); \
  sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
  *sse_ptr = sse; \
  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
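/* Editorial note (not in the original file): the *_avg_ variants average the
 * sub-pixel interpolated prediction with a second predictor sec before
 * computing the variance, as used for compound prediction. sec is laid out
 * as a contiguous w-wide block, which is why the kernels receive w as
 * sec_stride and the column splits advance sec by +16/+32/+48 in step with
 * src and dst.
 */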
#define FNS(opt1) \
FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
FN(8, 4, 8, 3, 2, opt1, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
#endif  // CONFIG_USE_X86INC
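/* Usage sketch (editorial addition; the buffer setup is illustrative, not
 * part of this file). Callers pass high-bitdepth buffers through the 8-bit
 * pointer convention of vpx_ports/mem.h, e.g. for a 10-bit 16x16 block:
 *
 *   uint16_t src[16 * 16], ref[16 * 16];  // 10-bit samples, range 0..1023
 *   uint32_t sse;
 *   uint32_t var = vpx_highbd_10_variance16x16_sse2(
 *       CONVERT_TO_BYTEPTR(src), 16, CONVERT_TO_BYTEPTR(ref), 16, &sse);
 *
 * where CONVERT_TO_BYTEPTR() is the inverse of the CONVERT_TO_SHORTPTR()
 * used throughout this file.
 */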