1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 12#include "vp8/encoder/variance.h" 13#include "vp8/common/pragmas.h" 14#include "vpx_ports/mem.h" 15 16extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 17extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 18extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 19extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); 20 21extern void vp8_filter_block2d_bil4x4_var_mmx 22( 23 const unsigned char *ref_ptr, 24 int ref_pixels_per_line, 25 const unsigned char *src_ptr, 26 int src_pixels_per_line, 27 const short *HFilter, 28 const short *VFilter, 29 int *sum, 30 unsigned int *sumsquared 31); 32 33extern unsigned int vp8_get4x4var_mmx 34( 35 const unsigned char *src_ptr, 36 int source_stride, 37 const unsigned char *ref_ptr, 38 int recon_stride, 39 unsigned int *SSE, 40 int *Sum 41); 42 43unsigned int vp8_get_mb_ss_sse2 44( 45 const short *src_ptr 46); 47unsigned int vp8_get16x16var_sse2 48( 49 const unsigned char *src_ptr, 50 int source_stride, 51 const unsigned char *ref_ptr, 52 int recon_stride, 53 unsigned int *SSE, 54 int *Sum 55); 56unsigned int vp8_get16x16pred_error_sse2 57( 58 const unsigned char *src_ptr, 59 int src_stride, 60 const unsigned char *ref_ptr, 61 int ref_stride 62); 63unsigned int vp8_get8x8var_sse2 64( 65 const unsigned char *src_ptr, 66 int source_stride, 67 const unsigned char *ref_ptr, 68 int recon_stride, 69 unsigned int *SSE, 70 int *Sum 71); 72void vp8_filter_block2d_bil_var_sse2 73( 74 const unsigned char *ref_ptr, 75 int ref_pixels_per_line, 76 const unsigned char *src_ptr, 77 int src_pixels_per_line, 78 unsigned int Height, 79 int xoffset, 80 int yoffset, 81 int *sum, 82 unsigned int *sumsquared 83); 84void vp8_half_horiz_vert_variance8x_h_sse2 85( 86 const unsigned char *ref_ptr, 87 int ref_pixels_per_line, 88 const unsigned char *src_ptr, 89 int src_pixels_per_line, 90 unsigned int Height, 91 int *sum, 92 unsigned int *sumsquared 93); 94void vp8_half_horiz_vert_variance16x_h_sse2 95( 96 const unsigned char *ref_ptr, 97 int ref_pixels_per_line, 98 const unsigned char *src_ptr, 99 int src_pixels_per_line, 100 unsigned int Height, 101 int *sum, 102 unsigned int *sumsquared 103); 104void vp8_half_horiz_variance8x_h_sse2 105( 106 const unsigned char *ref_ptr, 107 int ref_pixels_per_line, 108 const unsigned char *src_ptr, 109 int src_pixels_per_line, 110 unsigned int Height, 111 int *sum, 112 unsigned int *sumsquared 113); 114void vp8_half_horiz_variance16x_h_sse2 115( 116 const unsigned char *ref_ptr, 117 int ref_pixels_per_line, 118 const unsigned char *src_ptr, 119 int src_pixels_per_line, 120 unsigned int Height, 121 int *sum, 122 unsigned int *sumsquared 123); 124void vp8_half_vert_variance8x_h_sse2 125( 126 const unsigned char *ref_ptr, 127 int ref_pixels_per_line, 128 const unsigned char *src_ptr, 129 int src_pixels_per_line, 130 unsigned int Height, 131 int *sum, 132 unsigned int *sumsquared 133); 134void vp8_half_vert_variance16x_h_sse2 135( 136 const unsigned char *ref_ptr, 137 int ref_pixels_per_line, 138 const unsigned char *src_ptr, 139 int src_pixels_per_line, 140 unsigned int Height, 141 int *sum, 142 unsigned int *sumsquared 143); 144 145DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]); 146 147unsigned int vp8_variance4x4_wmt( 148 const unsigned char *src_ptr, 149 int source_stride, 150 const unsigned char *ref_ptr, 151 int recon_stride) 152{ 153 unsigned int var; 154 int avg; 155 156 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 157 return (var - ((avg * avg) >> 4)); 158 159} 160 161 162 163unsigned int vp8_variance8x8_wmt 164( 165 const unsigned char *src_ptr, 166 int source_stride, 167 const unsigned char *ref_ptr, 168 int recon_stride) 169{ 170 unsigned int var; 171 int avg; 172 173 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 174 175 return (var - ((avg * avg) >> 6)); 176 177} 178 179 180unsigned int vp8_variance16x16_wmt 181( 182 const unsigned char *src_ptr, 183 int source_stride, 184 const unsigned char *ref_ptr, 185 int recon_stride, 186 unsigned int *sse) 187{ 188 unsigned int sse0; 189 int sum0; 190 191 192 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 193 *sse = sse0; 194 return (sse0 - ((sum0 * sum0) >> 8)); 195} 196unsigned int vp8_mse16x16_wmt( 197 const unsigned char *src_ptr, 198 int source_stride, 199 const unsigned char *ref_ptr, 200 int recon_stride, 201 unsigned int *sse) 202{ 203 204 unsigned int sse0; 205 int sum0; 206 vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 207 *sse = sse0; 208 return sse0; 209 210} 211 212 213unsigned int vp8_variance16x8_wmt 214( 215 const unsigned char *src_ptr, 216 int source_stride, 217 const unsigned char *ref_ptr, 218 int recon_stride, 219 unsigned int *sse) 220{ 221 unsigned int sse0, sse1, var; 222 int sum0, sum1, avg; 223 224 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 225 vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 226 227 var = sse0 + sse1; 228 avg = sum0 + sum1; 229 *sse = var; 230 return (var - ((avg * avg) >> 7)); 231 232} 233 234unsigned int vp8_variance8x16_wmt 235( 236 const unsigned char *src_ptr, 237 int source_stride, 238 const unsigned char *ref_ptr, 239 int recon_stride, 240 unsigned int *sse) 241{ 242 unsigned int sse0, sse1, var; 243 int sum0, sum1, avg; 244 245 vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 246 vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 247 248 var = sse0 + sse1; 249 avg = sum0 + sum1; 250 *sse = var; 251 return (var - ((avg * avg) >> 7)); 252 253} 254 255unsigned int vp8_sub_pixel_variance4x4_wmt 256( 257 const unsigned char *src_ptr, 258 int src_pixels_per_line, 259 int xoffset, 260 int yoffset, 261 const unsigned char *dst_ptr, 262 int dst_pixels_per_line, 263 unsigned int *sse 264) 265{ 266 int xsum; 267 unsigned int xxsum; 268 vp8_filter_block2d_bil4x4_var_mmx( 269 src_ptr, src_pixels_per_line, 270 dst_ptr, dst_pixels_per_line, 271 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset], 272 &xsum, &xxsum 273 ); 274 *sse = xxsum; 275 return (xxsum - ((xsum * xsum) >> 4)); 276} 277 278 279unsigned int vp8_sub_pixel_variance8x8_wmt 280( 281 const unsigned char *src_ptr, 282 int src_pixels_per_line, 283 int xoffset, 284 int yoffset, 285 const unsigned char *dst_ptr, 286 int dst_pixels_per_line, 287 unsigned int *sse 288) 289{ 290 int xsum; 291 unsigned int xxsum; 292 293 if (xoffset == 4 && yoffset == 0) 294 { 295 vp8_half_horiz_variance8x_h_sse2( 296 src_ptr, src_pixels_per_line, 297 dst_ptr, dst_pixels_per_line, 8, 298 &xsum, &xxsum); 299 } 300 else if (xoffset == 0 && yoffset == 4) 301 { 302 vp8_half_vert_variance8x_h_sse2( 303 src_ptr, src_pixels_per_line, 304 dst_ptr, dst_pixels_per_line, 8, 305 &xsum, &xxsum); 306 } 307 else if (xoffset == 4 && yoffset == 4) 308 { 309 vp8_half_horiz_vert_variance8x_h_sse2( 310 src_ptr, src_pixels_per_line, 311 dst_ptr, dst_pixels_per_line, 8, 312 &xsum, &xxsum); 313 } 314 else 315 { 316 vp8_filter_block2d_bil_var_sse2( 317 src_ptr, src_pixels_per_line, 318 dst_ptr, dst_pixels_per_line, 8, 319 xoffset, yoffset, 320 &xsum, &xxsum); 321 } 322 323 *sse = xxsum; 324 return (xxsum - ((xsum * xsum) >> 6)); 325} 326 327unsigned int vp8_sub_pixel_variance16x16_wmt 328( 329 const unsigned char *src_ptr, 330 int src_pixels_per_line, 331 int xoffset, 332 int yoffset, 333 const unsigned char *dst_ptr, 334 int dst_pixels_per_line, 335 unsigned int *sse 336) 337{ 338 int xsum0, xsum1; 339 unsigned int xxsum0, xxsum1; 340 341 342 // note we could avoid these if statements if the calling function 343 // just called the appropriate functions inside. 344 if (xoffset == 4 && yoffset == 0) 345 { 346 vp8_half_horiz_variance16x_h_sse2( 347 src_ptr, src_pixels_per_line, 348 dst_ptr, dst_pixels_per_line, 16, 349 &xsum0, &xxsum0); 350 } 351 else if (xoffset == 0 && yoffset == 4) 352 { 353 vp8_half_vert_variance16x_h_sse2( 354 src_ptr, src_pixels_per_line, 355 dst_ptr, dst_pixels_per_line, 16, 356 &xsum0, &xxsum0); 357 } 358 else if (xoffset == 4 && yoffset == 4) 359 { 360 vp8_half_horiz_vert_variance16x_h_sse2( 361 src_ptr, src_pixels_per_line, 362 dst_ptr, dst_pixels_per_line, 16, 363 &xsum0, &xxsum0); 364 } 365 else 366 { 367 vp8_filter_block2d_bil_var_sse2( 368 src_ptr, src_pixels_per_line, 369 dst_ptr, dst_pixels_per_line, 16, 370 xoffset, yoffset, 371 &xsum0, &xxsum0 372 ); 373 374 vp8_filter_block2d_bil_var_sse2( 375 src_ptr + 8, src_pixels_per_line, 376 dst_ptr + 8, dst_pixels_per_line, 16, 377 xoffset, yoffset, 378 &xsum1, &xxsum1 379 ); 380 xsum0 += xsum1; 381 xxsum0 += xxsum1; 382 } 383 384 *sse = xxsum0; 385 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 386} 387 388unsigned int vp8_sub_pixel_mse16x16_wmt( 389 const unsigned char *src_ptr, 390 int src_pixels_per_line, 391 int xoffset, 392 int yoffset, 393 const unsigned char *dst_ptr, 394 int dst_pixels_per_line, 395 unsigned int *sse 396) 397{ 398 vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 399 return *sse; 400} 401 402unsigned int vp8_sub_pixel_variance16x8_wmt 403( 404 const unsigned char *src_ptr, 405 int src_pixels_per_line, 406 int xoffset, 407 int yoffset, 408 const unsigned char *dst_ptr, 409 int dst_pixels_per_line, 410 unsigned int *sse 411 412) 413{ 414 int xsum0, xsum1; 415 unsigned int xxsum0, xxsum1; 416 417 if (xoffset == 4 && yoffset == 0) 418 { 419 vp8_half_horiz_variance16x_h_sse2( 420 src_ptr, src_pixels_per_line, 421 dst_ptr, dst_pixels_per_line, 8, 422 &xsum0, &xxsum0); 423 } 424 else if (xoffset == 0 && yoffset == 4) 425 { 426 vp8_half_vert_variance16x_h_sse2( 427 src_ptr, src_pixels_per_line, 428 dst_ptr, dst_pixels_per_line, 8, 429 &xsum0, &xxsum0); 430 } 431 else if (xoffset == 4 && yoffset == 4) 432 { 433 vp8_half_horiz_vert_variance16x_h_sse2( 434 src_ptr, src_pixels_per_line, 435 dst_ptr, dst_pixels_per_line, 8, 436 &xsum0, &xxsum0); 437 } 438 else 439 { 440 vp8_filter_block2d_bil_var_sse2( 441 src_ptr, src_pixels_per_line, 442 dst_ptr, dst_pixels_per_line, 8, 443 xoffset, yoffset, 444 &xsum0, &xxsum0); 445 446 vp8_filter_block2d_bil_var_sse2( 447 src_ptr + 8, src_pixels_per_line, 448 dst_ptr + 8, dst_pixels_per_line, 8, 449 xoffset, yoffset, 450 &xsum1, &xxsum1); 451 xsum0 += xsum1; 452 xxsum0 += xxsum1; 453 } 454 455 *sse = xxsum0; 456 return (xxsum0 - ((xsum0 * xsum0) >> 7)); 457} 458 459unsigned int vp8_sub_pixel_variance8x16_wmt 460( 461 const unsigned char *src_ptr, 462 int src_pixels_per_line, 463 int xoffset, 464 int yoffset, 465 const unsigned char *dst_ptr, 466 int dst_pixels_per_line, 467 unsigned int *sse 468) 469{ 470 int xsum; 471 unsigned int xxsum; 472 473 if (xoffset == 4 && yoffset == 0) 474 { 475 vp8_half_horiz_variance8x_h_sse2( 476 src_ptr, src_pixels_per_line, 477 dst_ptr, dst_pixels_per_line, 16, 478 &xsum, &xxsum); 479 } 480 else if (xoffset == 0 && yoffset == 4) 481 { 482 vp8_half_vert_variance8x_h_sse2( 483 src_ptr, src_pixels_per_line, 484 dst_ptr, dst_pixels_per_line, 16, 485 &xsum, &xxsum); 486 } 487 else if (xoffset == 4 && yoffset == 4) 488 { 489 vp8_half_horiz_vert_variance8x_h_sse2( 490 src_ptr, src_pixels_per_line, 491 dst_ptr, dst_pixels_per_line, 16, 492 &xsum, &xxsum); 493 } 494 else 495 { 496 vp8_filter_block2d_bil_var_sse2( 497 src_ptr, src_pixels_per_line, 498 dst_ptr, dst_pixels_per_line, 16, 499 xoffset, yoffset, 500 &xsum, &xxsum); 501 } 502 503 *sse = xxsum; 504 return (xxsum - ((xsum * xsum) >> 7)); 505} 506 507 508unsigned int vp8_variance_halfpixvar16x16_h_wmt( 509 const unsigned char *src_ptr, 510 int src_pixels_per_line, 511 const unsigned char *dst_ptr, 512 int dst_pixels_per_line, 513 unsigned int *sse) 514{ 515 int xsum0; 516 unsigned int xxsum0; 517 518 vp8_half_horiz_variance16x_h_sse2( 519 src_ptr, src_pixels_per_line, 520 dst_ptr, dst_pixels_per_line, 16, 521 &xsum0, &xxsum0); 522 523 *sse = xxsum0; 524 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 525} 526 527 528unsigned int vp8_variance_halfpixvar16x16_v_wmt( 529 const unsigned char *src_ptr, 530 int src_pixels_per_line, 531 const unsigned char *dst_ptr, 532 int dst_pixels_per_line, 533 unsigned int *sse) 534{ 535 int xsum0; 536 unsigned int xxsum0; 537 vp8_half_vert_variance16x_h_sse2( 538 src_ptr, src_pixels_per_line, 539 dst_ptr, dst_pixels_per_line, 16, 540 &xsum0, &xxsum0); 541 542 *sse = xxsum0; 543 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 544} 545 546 547unsigned int vp8_variance_halfpixvar16x16_hv_wmt( 548 const unsigned char *src_ptr, 549 int src_pixels_per_line, 550 const unsigned char *dst_ptr, 551 int dst_pixels_per_line, 552 unsigned int *sse) 553{ 554 int xsum0; 555 unsigned int xxsum0; 556 557 vp8_half_horiz_vert_variance16x_h_sse2( 558 src_ptr, src_pixels_per_line, 559 dst_ptr, dst_pixels_per_line, 16, 560 &xsum0, &xxsum0); 561 562 *sse = xxsum0; 563 return (xxsum0 - ((xsum0 * xsum0) >> 8)); 564} 565