1/* 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10#include "./vpx_config.h" 11 12#include "vpx_ports/mem.h" 13 14typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, 15 const uint16_t *ref, int ref_stride, 16 uint32_t *sse, int *sum); 17 18uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, 19 const uint16_t *ref, int ref_stride, 20 uint32_t *sse, int *sum); 21 22uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, 23 const uint16_t *ref, int ref_stride, 24 uint32_t *sse, int *sum); 25 26static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, 27 const uint16_t *ref, int ref_stride, int w, 28 int h, uint32_t *sse, int *sum, 29 high_variance_fn_t var_fn, int block_size) { 30 int i, j; 31 32 *sse = 0; 33 *sum = 0; 34 35 for (i = 0; i < h; i += block_size) { 36 for (j = 0; j < w; j += block_size) { 37 unsigned int sse0; 38 int sum0; 39 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 40 ref_stride, &sse0, &sum0); 41 *sse += sse0; 42 *sum += sum0; 43 } 44 } 45} 46 47static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, 48 const uint16_t *ref, int ref_stride, int w, 49 int h, uint32_t *sse, int *sum, 50 high_variance_fn_t var_fn, int block_size) { 51 int i, j; 52 uint64_t sse_long = 0; 53 int32_t sum_long = 0; 54 55 for (i = 0; i < h; i += block_size) { 56 for (j = 0; j < w; j += block_size) { 57 unsigned int sse0; 58 int sum0; 59 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 60 ref_stride, &sse0, &sum0); 61 sse_long += sse0; 62 sum_long += sum0; 63 } 64 } 65 *sum = ROUND_POWER_OF_TWO(sum_long, 2); 66 *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); 67} 68 69static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, 70 const uint16_t *ref, int ref_stride, int w, 71 int h, uint32_t *sse, int *sum, 72 high_variance_fn_t var_fn, int block_size) { 73 int i, j; 74 uint64_t sse_long = 0; 75 int32_t sum_long = 0; 76 77 for (i = 0; i < h; i += block_size) { 78 for (j = 0; j < w; j += block_size) { 79 unsigned int sse0; 80 int sum0; 81 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 82 ref_stride, &sse0, &sum0); 83 sse_long += sse0; 84 sum_long += sum0; 85 } 86 } 87 *sum = ROUND_POWER_OF_TWO(sum_long, 4); 88 *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); 89} 90 91#define HIGH_GET_VAR(S) \ 92 void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ 93 const uint8_t *ref8, int ref_stride, \ 94 uint32_t *sse, int *sum) { \ 95 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 96 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 97 vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ 98 sum); \ 99 } \ 100 \ 101 void vpx_highbd_10_get##S##x##S##var_sse2( \ 102 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 103 int ref_stride, uint32_t *sse, int *sum) { \ 104 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 105 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 106 vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ 107 sum); \ 108 *sum = ROUND_POWER_OF_TWO(*sum, 2); \ 109 *sse = ROUND_POWER_OF_TWO(*sse, 4); \ 110 } \ 111 \ 112 void vpx_highbd_12_get##S##x##S##var_sse2( \ 113 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 114 int ref_stride, uint32_t *sse, int *sum) { \ 115 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 116 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 117 vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ 118 sum); \ 119 *sum = ROUND_POWER_OF_TWO(*sum, 4); \ 120 *sse = ROUND_POWER_OF_TWO(*sse, 8); \ 121 } 122 123HIGH_GET_VAR(16); 124HIGH_GET_VAR(8); 125 126#undef HIGH_GET_VAR 127 128#define VAR_FN(w, h, block_size, shift) \ 129 uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \ 130 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 131 int ref_stride, uint32_t *sse) { \ 132 int sum; \ 133 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 134 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 135 highbd_8_variance_sse2( \ 136 src, src_stride, ref, ref_stride, w, h, sse, &sum, \ 137 vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 138 return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ 139 } \ 140 \ 141 uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ 142 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 143 int ref_stride, uint32_t *sse) { \ 144 int sum; \ 145 int64_t var; \ 146 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 147 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 148 highbd_10_variance_sse2( \ 149 src, src_stride, ref, ref_stride, w, h, sse, &sum, \ 150 vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 151 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ 152 return (var >= 0) ? (uint32_t)var : 0; \ 153 } \ 154 \ 155 uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \ 156 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 157 int ref_stride, uint32_t *sse) { \ 158 int sum; \ 159 int64_t var; \ 160 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 161 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 162 highbd_12_variance_sse2( \ 163 src, src_stride, ref, ref_stride, w, h, sse, &sum, \ 164 vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 165 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ 166 return (var >= 0) ? (uint32_t)var : 0; \ 167 } 168 169VAR_FN(64, 64, 16, 12); 170VAR_FN(64, 32, 16, 11); 171VAR_FN(32, 64, 16, 11); 172VAR_FN(32, 32, 16, 10); 173VAR_FN(32, 16, 16, 9); 174VAR_FN(16, 32, 16, 9); 175VAR_FN(16, 16, 16, 8); 176VAR_FN(16, 8, 8, 7); 177VAR_FN(8, 16, 8, 7); 178VAR_FN(8, 8, 8, 6); 179 180#undef VAR_FN 181 182unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, 183 const uint8_t *ref8, int ref_stride, 184 unsigned int *sse) { 185 int sum; 186 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 187 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 188 highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 189 vpx_highbd_calc16x16var_sse2, 16); 190 return *sse; 191} 192 193unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, 194 const uint8_t *ref8, int ref_stride, 195 unsigned int *sse) { 196 int sum; 197 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 198 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 199 highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 200 vpx_highbd_calc16x16var_sse2, 16); 201 return *sse; 202} 203 204unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, 205 const uint8_t *ref8, int ref_stride, 206 unsigned int *sse) { 207 int sum; 208 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 209 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 210 highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 211 vpx_highbd_calc16x16var_sse2, 16); 212 return *sse; 213} 214 215unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, 216 const uint8_t *ref8, int ref_stride, 217 unsigned int *sse) { 218 int sum; 219 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 220 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 221 highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 222 vpx_highbd_calc8x8var_sse2, 8); 223 return *sse; 224} 225 226unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, 227 const uint8_t *ref8, int ref_stride, 228 unsigned int *sse) { 229 int sum; 230 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 231 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 232 highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 233 vpx_highbd_calc8x8var_sse2, 8); 234 return *sse; 235} 236 237unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, 238 const uint8_t *ref8, int ref_stride, 239 unsigned int *sse) { 240 int sum; 241 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 242 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 243 highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 244 vpx_highbd_calc8x8var_sse2, 8); 245 return *sse; 246} 247 248// The 2 unused parameters are place holders for PIC enabled build. 249// These definitions are for functions defined in 250// highbd_subpel_variance_impl_sse2.asm 251#define DECL(w, opt) \ 252 int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ 253 const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ 254 const uint16_t *dst, ptrdiff_t dst_stride, int height, \ 255 unsigned int *sse, void *unused0, void *unused); 256#define DECLS(opt) \ 257 DECL(8, opt); \ 258 DECL(16, opt) 259 260DECLS(sse2); 261 262#undef DECLS 263#undef DECL 264 265#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 266 uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ 267 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 268 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ 269 uint32_t sse; \ 270 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 271 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 272 int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 273 src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ 274 NULL); \ 275 if (w > wf) { \ 276 unsigned int sse2; \ 277 int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 278 src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ 279 &sse2, NULL, NULL); \ 280 se += se2; \ 281 sse += sse2; \ 282 if (w > wf * 2) { \ 283 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 284 src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ 285 &sse2, NULL, NULL); \ 286 se += se2; \ 287 sse += sse2; \ 288 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 289 src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ 290 &sse2, NULL, NULL); \ 291 se += se2; \ 292 sse += sse2; \ 293 } \ 294 } \ 295 *sse_ptr = sse; \ 296 return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ 297 } \ 298 \ 299 uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ 300 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 301 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ 302 int64_t var; \ 303 uint32_t sse; \ 304 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 305 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 306 int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 307 src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ 308 NULL); \ 309 if (w > wf) { \ 310 uint32_t sse2; \ 311 int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 312 src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ 313 &sse2, NULL, NULL); \ 314 se += se2; \ 315 sse += sse2; \ 316 if (w > wf * 2) { \ 317 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 318 src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ 319 &sse2, NULL, NULL); \ 320 se += se2; \ 321 sse += sse2; \ 322 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 323 src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ 324 &sse2, NULL, NULL); \ 325 se += se2; \ 326 sse += sse2; \ 327 } \ 328 } \ 329 se = ROUND_POWER_OF_TWO(se, 2); \ 330 sse = ROUND_POWER_OF_TWO(sse, 4); \ 331 *sse_ptr = sse; \ 332 var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ 333 return (var >= 0) ? (uint32_t)var : 0; \ 334 } \ 335 \ 336 uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ 337 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 338 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ 339 int start_row; \ 340 uint32_t sse; \ 341 int se = 0; \ 342 int64_t var; \ 343 uint64_t long_sse = 0; \ 344 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 345 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 346 for (start_row = 0; start_row < h; start_row += 16) { \ 347 uint32_t sse2; \ 348 int height = h - start_row < 16 ? h - start_row : 16; \ 349 int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 350 src + (start_row * src_stride), src_stride, x_offset, y_offset, \ 351 dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ 352 NULL); \ 353 se += se2; \ 354 long_sse += sse2; \ 355 if (w > wf) { \ 356 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 357 src + 16 + (start_row * src_stride), src_stride, x_offset, \ 358 y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ 359 &sse2, NULL, NULL); \ 360 se += se2; \ 361 long_sse += sse2; \ 362 if (w > wf * 2) { \ 363 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 364 src + 32 + (start_row * src_stride), src_stride, x_offset, \ 365 y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ 366 height, &sse2, NULL, NULL); \ 367 se += se2; \ 368 long_sse += sse2; \ 369 se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ 370 src + 48 + (start_row * src_stride), src_stride, x_offset, \ 371 y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ 372 height, &sse2, NULL, NULL); \ 373 se += se2; \ 374 long_sse += sse2; \ 375 } \ 376 } \ 377 } \ 378 se = ROUND_POWER_OF_TWO(se, 4); \ 379 sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ 380 *sse_ptr = sse; \ 381 var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ 382 return (var >= 0) ? (uint32_t)var : 0; \ 383 } 384 385#define FNS(opt) \ 386 FN(64, 64, 16, 6, 6, opt, (int64_t)); \ 387 FN(64, 32, 16, 6, 5, opt, (int64_t)); \ 388 FN(32, 64, 16, 5, 6, opt, (int64_t)); \ 389 FN(32, 32, 16, 5, 5, opt, (int64_t)); \ 390 FN(32, 16, 16, 5, 4, opt, (int64_t)); \ 391 FN(16, 32, 16, 4, 5, opt, (int64_t)); \ 392 FN(16, 16, 16, 4, 4, opt, (int64_t)); \ 393 FN(16, 8, 16, 4, 3, opt, (int64_t)); \ 394 FN(8, 16, 8, 3, 4, opt, (int64_t)); \ 395 FN(8, 8, 8, 3, 3, opt, (int64_t)); \ 396 FN(8, 4, 8, 3, 2, opt, (int64_t)); 397 398FNS(sse2); 399 400#undef FNS 401#undef FN 402 403// The 2 unused parameters are place holders for PIC enabled build. 404#define DECL(w, opt) \ 405 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ 406 const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ 407 const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ 408 ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ 409 void *unused); 410#define DECLS(opt1) \ 411 DECL(16, opt1) \ 412 DECL(8, opt1) 413 414DECLS(sse2); 415#undef DECL 416#undef DECLS 417 418#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ 419 uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ 420 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 421 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ 422 const uint8_t *sec8) { \ 423 uint32_t sse; \ 424 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 425 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 426 uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ 427 int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 428 src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ 429 NULL, NULL); \ 430 if (w > wf) { \ 431 uint32_t sse2; \ 432 int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 433 src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ 434 sec + 16, w, h, &sse2, NULL, NULL); \ 435 se += se2; \ 436 sse += sse2; \ 437 if (w > wf * 2) { \ 438 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 439 src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ 440 sec + 32, w, h, &sse2, NULL, NULL); \ 441 se += se2; \ 442 sse += sse2; \ 443 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 444 src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ 445 sec + 48, w, h, &sse2, NULL, NULL); \ 446 se += se2; \ 447 sse += sse2; \ 448 } \ 449 } \ 450 *sse_ptr = sse; \ 451 return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \ 452 } \ 453 \ 454 uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ 455 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 456 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ 457 const uint8_t *sec8) { \ 458 int64_t var; \ 459 uint32_t sse; \ 460 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 461 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 462 uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ 463 int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 464 src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ 465 NULL, NULL); \ 466 if (w > wf) { \ 467 uint32_t sse2; \ 468 int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 469 src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ 470 sec + 16, w, h, &sse2, NULL, NULL); \ 471 se += se2; \ 472 sse += sse2; \ 473 if (w > wf * 2) { \ 474 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 475 src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ 476 sec + 32, w, h, &sse2, NULL, NULL); \ 477 se += se2; \ 478 sse += sse2; \ 479 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 480 src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ 481 sec + 48, w, h, &sse2, NULL, NULL); \ 482 se += se2; \ 483 sse += sse2; \ 484 } \ 485 } \ 486 se = ROUND_POWER_OF_TWO(se, 2); \ 487 sse = ROUND_POWER_OF_TWO(sse, 4); \ 488 *sse_ptr = sse; \ 489 var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ 490 return (var >= 0) ? (uint32_t)var : 0; \ 491 } \ 492 \ 493 uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ 494 const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ 495 const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ 496 const uint8_t *sec8) { \ 497 int start_row; \ 498 int64_t var; \ 499 uint32_t sse; \ 500 int se = 0; \ 501 uint64_t long_sse = 0; \ 502 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 503 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ 504 uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ 505 for (start_row = 0; start_row < h; start_row += 16) { \ 506 uint32_t sse2; \ 507 int height = h - start_row < 16 ? h - start_row : 16; \ 508 int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 509 src + (start_row * src_stride), src_stride, x_offset, y_offset, \ 510 dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ 511 w, height, &sse2, NULL, NULL); \ 512 se += se2; \ 513 long_sse += sse2; \ 514 if (w > wf) { \ 515 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 516 src + 16 + (start_row * src_stride), src_stride, x_offset, \ 517 y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ 518 sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ 519 se += se2; \ 520 long_sse += sse2; \ 521 if (w > wf * 2) { \ 522 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 523 src + 32 + (start_row * src_stride), src_stride, x_offset, \ 524 y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ 525 sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ 526 se += se2; \ 527 long_sse += sse2; \ 528 se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ 529 src + 48 + (start_row * src_stride), src_stride, x_offset, \ 530 y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ 531 sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ 532 se += se2; \ 533 long_sse += sse2; \ 534 } \ 535 } \ 536 } \ 537 se = ROUND_POWER_OF_TWO(se, 4); \ 538 sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \ 539 *sse_ptr = sse; \ 540 var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ 541 return (var >= 0) ? (uint32_t)var : 0; \ 542 } 543 544#define FNS(opt1) \ 545 FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ 546 FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ 547 FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ 548 FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ 549 FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ 550 FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ 551 FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ 552 FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ 553 FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ 554 FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ 555 FN(8, 4, 8, 3, 2, opt1, (int64_t)); 556 557FNS(sse2); 558 559#undef FNS 560#undef FN 561