1/****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*/ 20 21/** 22****************************************************************************** 23* @file ime_distortion_metrics_sse42.c 24* 25* @brief 26* This file contains definitions of routines that compute distortion 27* between two macro/sub blocks of identical dimensions 28* 29* @author 30* Ittiam 31* 32* @par List of Functions: 33* - ime_compute_sad_16x16_sse42() 34* - ime_compute_sad_16x16_fast_sse42() 35* - ime_compute_sad_16x16_ea8_sse42() 36* - ime_compute_sad_16x8_sse42() 37* - ime_calculate_sad4_prog_sse42() 38* - ime_sub_pel_compute_sad_16x16_sse42() 39* - ime_compute_satqd_16x16_lumainter_sse42() 40* 41* @remarks 42* None 43* 44******************************************************************************* 45*/ 46 47/*****************************************************************************/ 48/* File Includes */ 49/*****************************************************************************/ 50 51/* System include files */ 52#include <stdio.h> 53#include <stdlib.h> 54#include <string.h> 55 56/* User include files */ 57#include "ime_typedefs.h" 58#include "ime_defs.h" 59#include 
"ime_macros.h" 60#include "ime_statistics.h" 61#include "ime_platform_macros.h" 62#include "ime_distortion_metrics.h" 63#include <immintrin.h> 64 65/*****************************************************************************/ 66/* Function Definitions */ 67/*****************************************************************************/ 68 69/** 70****************************************************************************** 71* 72* @brief computes distortion (SAD) between 2 16x16 blocks 73* 74* @par Description 75* This functions computes SAD between 2 16x16 blocks. There is a provision 76* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 77* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 78* 79* @param[in] pu1_src 80* UWORD8 pointer to the source 81* 82* @param[out] pu1_dst 83* UWORD8 pointer to the destination 84* 85* @param[in] src_strd 86* integer source stride 87* 88* @param[in] dst_strd 89* integer destination stride 90* 91* @param[in] i4_max_sad 92* integer maximum allowed distortion 93* 94* @param[out] pi4_mb_distortion 95* integer evaluated sad 96* 97* @remarks 98* 99****************************************************************************** 100*/ 101void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src, 102 UWORD8 *pu1_est, 103 WORD32 src_strd, 104 WORD32 est_strd, 105 WORD32 i4_max_sad, 106 WORD32 *pi4_mb_distortion) 107{ 108 __m128i src_r0, src_r1, src_r2, src_r3; 109 __m128i est_r0, est_r1, est_r2, est_r3; 110 __m128i res_r0, res_r1, res_r2, res_r3; 111 __m128i sad_val; 112 int val1, val2; 113 UNUSED (i4_max_sad); 114 115 // Row 0-3 sad calculation 116 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 117 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 118 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 119 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 120 121 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 122 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 
est_strd)); 123 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 124 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 125 126 res_r0 = _mm_sad_epu8(src_r0, est_r0); 127 res_r1 = _mm_sad_epu8(src_r1, est_r1); 128 res_r2 = _mm_sad_epu8(src_r2, est_r2); 129 res_r3 = _mm_sad_epu8(src_r3, est_r3); 130 131 sad_val = _mm_add_epi64(res_r0, res_r1); 132 sad_val = _mm_add_epi64(sad_val, res_r2); 133 sad_val = _mm_add_epi64(sad_val, res_r3); 134 135 // Row 4-7 sad calculation 136 pu1_src += 4*src_strd; 137 pu1_est += 4*est_strd; 138 139 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 140 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 141 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 142 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 143 144 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 145 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 146 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 147 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 148 149 res_r0 = _mm_sad_epu8(src_r0, est_r0); 150 res_r1 = _mm_sad_epu8(src_r1, est_r1); 151 res_r2 = _mm_sad_epu8(src_r2, est_r2); 152 res_r3 = _mm_sad_epu8(src_r3, est_r3); 153 154 sad_val = _mm_add_epi64(sad_val, res_r0); 155 sad_val = _mm_add_epi64(sad_val, res_r1); 156 sad_val = _mm_add_epi64(sad_val, res_r2); 157 sad_val = _mm_add_epi64(sad_val, res_r3); 158 159 // Row 8-11 sad calculation 160 pu1_src += 4*src_strd; 161 pu1_est += 4*est_strd; 162 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 163 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 164 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 165 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 166 167 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 168 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 169 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 170 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 
3*est_strd)); 171 172 res_r0 = _mm_sad_epu8(src_r0, est_r0); 173 res_r1 = _mm_sad_epu8(src_r1, est_r1); 174 res_r2 = _mm_sad_epu8(src_r2, est_r2); 175 res_r3 = _mm_sad_epu8(src_r3, est_r3); 176 177 sad_val = _mm_add_epi64(sad_val, res_r0); 178 sad_val = _mm_add_epi64(sad_val, res_r1); 179 sad_val = _mm_add_epi64(sad_val, res_r2); 180 sad_val = _mm_add_epi64(sad_val, res_r3); 181 182 // Row 12-15 sad calculation 183 pu1_src += 4*src_strd; 184 pu1_est += 4*est_strd; 185 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 186 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 187 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 188 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 189 190 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 191 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 192 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 193 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 194 195 res_r0 = _mm_sad_epu8(src_r0, est_r0); 196 res_r1 = _mm_sad_epu8(src_r1, est_r1); 197 res_r2 = _mm_sad_epu8(src_r2, est_r2); 198 res_r3 = _mm_sad_epu8(src_r3, est_r3); 199 200 sad_val = _mm_add_epi64(sad_val, res_r0); 201 sad_val = _mm_add_epi64(sad_val, res_r1); 202 sad_val = _mm_add_epi64(sad_val, res_r2); 203 sad_val = _mm_add_epi64(sad_val, res_r3); 204 205 val1 = _mm_extract_epi32(sad_val,0); 206 val2 = _mm_extract_epi32(sad_val, 2); 207 *pi4_mb_distortion = (val1+val2); 208 209 return; 210} 211 212/** 213****************************************************************************** 214* 215* @brief computes distortion (SAD) between 2 16x8 blocks 216* 217* 218* @par Description 219* This functions computes SAD between 2 16x8 blocks. There is a provision 220* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 221* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
222* 223* @param[in] pu1_src 224* UWORD8 pointer to the source 225* 226* @param[out] pu1_dst 227* UWORD8 pointer to the destination 228* 229* @param[in] src_strd 230* integer source stride 231* 232* @param[in] dst_strd 233* integer destination stride 234* 235* @param[in] u4_max_sad 236* integer maximum allowed distortion 237* 238* @param[out] pi4_mb_distortion 239* integer evaluated sad 240* 241* @remarks 242* 243****************************************************************************** 244*/ 245void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src, 246 UWORD8 *pu1_est, 247 WORD32 src_strd, 248 WORD32 est_strd, 249 WORD32 i4_max_sad, 250 WORD32 *pi4_mb_distortion) 251{ 252 __m128i src_r0, src_r1, src_r2, src_r3; 253 __m128i est_r0, est_r1, est_r2, est_r3; 254 __m128i res_r0, res_r1, res_r2, res_r3; 255 __m128i sad_val; 256 int val1, val2; 257 UNUSED (i4_max_sad); 258 259 // Row 0-3 sad calculation 260 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 261 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 262 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 263 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 264 265 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 266 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 267 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 268 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 269 270 res_r0 = _mm_sad_epu8(src_r0, est_r0); 271 res_r1 = _mm_sad_epu8(src_r1, est_r1); 272 res_r2 = _mm_sad_epu8(src_r2, est_r2); 273 res_r3 = _mm_sad_epu8(src_r3, est_r3); 274 275 sad_val = _mm_add_epi64(res_r0, res_r1); 276 sad_val = _mm_add_epi64(sad_val, res_r2); 277 sad_val = _mm_add_epi64(sad_val, res_r3); 278 279 // Row 4-7 sad calculation 280 pu1_src += 4*src_strd; 281 pu1_est += 4*est_strd; 282 283 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 284 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 285 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 
2*src_strd)); 286 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 287 288 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 289 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 290 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 291 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 292 293 res_r0 = _mm_sad_epu8(src_r0, est_r0); 294 res_r1 = _mm_sad_epu8(src_r1, est_r1); 295 res_r2 = _mm_sad_epu8(src_r2, est_r2); 296 res_r3 = _mm_sad_epu8(src_r3, est_r3); 297 298 sad_val = _mm_add_epi64(sad_val, res_r0); 299 sad_val = _mm_add_epi64(sad_val, res_r1); 300 sad_val = _mm_add_epi64(sad_val, res_r2); 301 sad_val = _mm_add_epi64(sad_val, res_r3); 302 303 val1 = _mm_extract_epi32(sad_val,0); 304 val2 = _mm_extract_epi32(sad_val, 2); 305 *pi4_mb_distortion = (val1+val2); 306 return; 307} 308 309/** 310****************************************************************************** 311* 312* @brief computes distortion (SAD) between 2 16x16 blocks 313* 314* @par Description 315* This functions computes SAD between 2 16x16 blocks. There is a provision 316* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 317* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
318* 319* @param[in] pu1_src 320* UWORD8 pointer to the source 321* 322* @param[out] pu1_dst 323* UWORD8 pointer to the destination 324* 325* @param[in] src_strd 326* integer source stride 327* 328* @param[in] dst_strd 329* integer destination stride 330* 331* @param[in] i4_max_sad 332* integer maximum allowed distortion 333* 334* @param[out] pi4_mb_distortion 335* integer evaluated sad 336* 337* @remarks 338* 339****************************************************************************** 340*/ 341void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src, 342 UWORD8 *pu1_est, 343 WORD32 src_strd, 344 WORD32 est_strd, 345 WORD32 i4_max_sad, 346 WORD32 *pi4_mb_distortion) 347{ 348 __m128i src_r0, src_r1, src_r2, src_r3; 349 __m128i est_r0, est_r1, est_r2, est_r3; 350 __m128i res_r0, res_r1, res_r2, res_r3; 351 __m128i sad_val; 352 WORD32 val1, val2; 353 WORD32 i4_sad; 354 UWORD8 *pu1_src_temp = pu1_src + src_strd; 355 UWORD8 *pu1_est_temp = pu1_est + est_strd; 356 357 // Row 0,2,4,6 sad calculation 358 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 359 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 360 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 361 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 362 363 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 364 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 365 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 366 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 367 368 res_r0 = _mm_sad_epu8(src_r0, est_r0); 369 res_r1 = _mm_sad_epu8(src_r1, est_r1); 370 res_r2 = _mm_sad_epu8(src_r2, est_r2); 371 res_r3 = _mm_sad_epu8(src_r3, est_r3); 372 373 sad_val = _mm_add_epi64(res_r0, res_r1); 374 sad_val = _mm_add_epi64(sad_val, res_r2); 375 sad_val = _mm_add_epi64(sad_val, res_r3); 376 377 // Row 8,10,12,14 sad calculation 378 pu1_src += 8*src_strd; 379 pu1_est += 8*est_strd; 380 381 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 382 src_r1 = 
_mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 383 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 384 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 385 386 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 387 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 388 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 389 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 390 391 res_r0 = _mm_sad_epu8(src_r0, est_r0); 392 res_r1 = _mm_sad_epu8(src_r1, est_r1); 393 res_r2 = _mm_sad_epu8(src_r2, est_r2); 394 res_r3 = _mm_sad_epu8(src_r3, est_r3); 395 396 sad_val = _mm_add_epi64(sad_val, res_r0); 397 sad_val = _mm_add_epi64(sad_val, res_r1); 398 sad_val = _mm_add_epi64(sad_val, res_r2); 399 sad_val = _mm_add_epi64(sad_val, res_r3); 400 401 pu1_src = pu1_src_temp; 402 pu1_est = pu1_est_temp; 403 404 val1 = _mm_extract_epi32(sad_val, 0); 405 val2 = _mm_extract_epi32(sad_val, 2); 406 407 i4_sad = val1 + val2; 408 if (i4_max_sad < i4_sad) 409 { 410 *pi4_mb_distortion = i4_sad; 411 return ; 412 } 413 // Row 1,3,5,7 sad calculation 414 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 415 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 416 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 417 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 418 419 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 420 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 421 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 422 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 423 424 res_r0 = _mm_sad_epu8(src_r0, est_r0); 425 res_r1 = _mm_sad_epu8(src_r1, est_r1); 426 res_r2 = _mm_sad_epu8(src_r2, est_r2); 427 res_r3 = _mm_sad_epu8(src_r3, est_r3); 428 429 sad_val = _mm_add_epi64(sad_val, res_r0); 430 sad_val = _mm_add_epi64(sad_val, res_r1); 431 sad_val = _mm_add_epi64(sad_val, res_r2); 432 sad_val = _mm_add_epi64(sad_val, res_r3); 433 434 // Row 9,11,13,15 
sad calculation 435 pu1_src += 8*src_strd; 436 pu1_est += 8*est_strd; 437 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 438 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 439 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 440 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 441 442 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 443 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 444 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 445 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 446 447 res_r0 = _mm_sad_epu8(src_r0, est_r0); 448 res_r1 = _mm_sad_epu8(src_r1, est_r1); 449 res_r2 = _mm_sad_epu8(src_r2, est_r2); 450 res_r3 = _mm_sad_epu8(src_r3, est_r3); 451 452 sad_val = _mm_add_epi64(sad_val, res_r0); 453 sad_val = _mm_add_epi64(sad_val, res_r1); 454 sad_val = _mm_add_epi64(sad_val, res_r2); 455 sad_val = _mm_add_epi64(sad_val, res_r3); 456 457 val1 = _mm_extract_epi32(sad_val, 0); 458 val2 = _mm_extract_epi32(sad_val, 2); 459 *pi4_mb_distortion = (val1+val2); 460 461 return; 462} 463 464/** 465****************************************************************************** 466* 467* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) 468* 469* @par Description 470* This functions computes SAD between 2 16x16 blocks by processing alternate 471* rows (fast mode). For fast mode it is assumed sad obtained by processing 472* alternate rows is approximately twice as that for the whole block. 
473* 474* @param[in] pu1_src 475* UWORD8 pointer to the source 476* 477* @param[out] pu1_dst 478* UWORD8 pointer to the destination 479* 480* @param[in] src_strd 481* integer source stride 482* 483* @param[in] dst_strd 484* integer destination stride 485* 486* @param[in] i4_max_sad 487* integer maximum allowed distortion 488* 489* @param[out] pi4_mb_distortion 490* integer evaluated sad 491* 492* @remarks 493* 494****************************************************************************** 495*/ 496void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src, 497 UWORD8 *pu1_est, 498 WORD32 src_strd, 499 WORD32 est_strd, 500 WORD32 i4_max_sad, 501 WORD32 *pi4_mb_distortion) 502{ 503 __m128i src_r0, src_r1, src_r2, src_r3; 504 __m128i est_r0, est_r1, est_r2, est_r3; 505 __m128i res_r0, res_r1, res_r2, res_r3; 506 __m128i sad_val; 507 WORD32 val1, val2; 508 WORD32 i4_sad; 509 UWORD8 *pu1_src_temp = pu1_src + src_strd; 510 UWORD8 *pu1_est_temp = pu1_est + est_strd; 511 UNUSED (i4_max_sad); 512 513 // Row 0,2,4,6 sad calculation 514 src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 515 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); 516 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd)); 517 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); 518 519 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 520 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); 521 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); 522 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); 523 524 res_r0 = _mm_sad_epu8(src_r0, est_r0); 525 res_r1 = _mm_sad_epu8(src_r1, est_r1); 526 res_r2 = _mm_sad_epu8(src_r2, est_r2); 527 res_r3 = _mm_sad_epu8(src_r3, est_r3); 528 529 sad_val = _mm_add_epi64(res_r0, res_r1); 530 sad_val = _mm_add_epi64(sad_val, res_r2); 531 sad_val = _mm_add_epi64(sad_val, res_r3); 532 533 // Row 8,10,12,14 sad calculation 534 pu1_src += 8 * src_strd; 535 pu1_est += 8 * est_strd; 536 537 src_r0 = 
_mm_loadu_si128((__m128i *) (pu1_src)); 538 src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); 539 src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd)); 540 src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); 541 542 est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 543 est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); 544 est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); 545 est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); 546 547 res_r0 = _mm_sad_epu8(src_r0, est_r0); 548 res_r1 = _mm_sad_epu8(src_r1, est_r1); 549 res_r2 = _mm_sad_epu8(src_r2, est_r2); 550 res_r3 = _mm_sad_epu8(src_r3, est_r3); 551 552 sad_val = _mm_add_epi64(sad_val, res_r0); 553 sad_val = _mm_add_epi64(sad_val, res_r1); 554 sad_val = _mm_add_epi64(sad_val, res_r2); 555 sad_val = _mm_add_epi64(sad_val, res_r3); 556 557 pu1_src = pu1_src_temp; 558 pu1_est = pu1_est_temp; 559 560 val1 = _mm_extract_epi32(sad_val, 0); 561 val2 = _mm_extract_epi32(sad_val, 2); 562 563 i4_sad = val1 + val2; 564 *pi4_mb_distortion = (i4_sad<<1); 565 return; 566} 567 568/** 569******************************************************************************* 570* 571* @brief compute sad 572* 573* @par Description: This function computes the sad at vertices of diamond grid 574* centered at reference pointer and at unit distance from it. 
575* 576* @param[in] pu1_ref 577* UWORD8 pointer to the reference 578* 579* @param[out] pu1_src 580* UWORD8 pointer to the source 581* 582* @param[in] ref_strd 583* integer reference stride 584* 585* @param[in] src_strd 586* integer source stride 587* 588* @param[out] pi4_sad 589* pointer to integer array evaluated sad 590* 591* @returns sad at all evaluated vertexes 592* 593* @remarks none 594* 595******************************************************************************* 596*/ 597void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref, 598 UWORD8 *pu1_src, 599 WORD32 ref_strd, 600 WORD32 src_strd, 601 WORD32 *pi4_sad) 602{ 603 /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */ 604 UWORD8 *left_ptr = pu1_ref - 1; 605 UWORD8 *right_ptr = pu1_ref + 1; 606 UWORD8 *top_ptr = pu1_ref - ref_strd; 607 UWORD8 *bot_ptr = pu1_ref + ref_strd; 608 609 WORD32 val1, val2; 610 __m128i src, ref_left, ref_right, ref_top, ref_bot; 611 __m128i res_r0, res_r1, res_r2, res_r3; 612 __m128i sad_r0, sad_r1, sad_r2, sad_r3; 613 614 // Row 0 sad calculation 615 src = _mm_loadu_si128((__m128i *) (pu1_src)); 616 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 617 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 618 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 619 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 620 621 sad_r0 = _mm_sad_epu8(src, ref_left); 622 sad_r1 = _mm_sad_epu8(src, ref_right); 623 sad_r2 = _mm_sad_epu8(src, ref_top); 624 sad_r3 = _mm_sad_epu8(src, ref_bot); 625 626 pu1_src += src_strd; 627 left_ptr += ref_strd; 628 right_ptr += ref_strd; 629 top_ptr += ref_strd; 630 bot_ptr += ref_strd; 631 632 // Row 1 sad calculation 633 src = _mm_loadu_si128((__m128i *) (pu1_src)); 634 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 635 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 636 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 637 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 638 639 res_r0 = _mm_sad_epu8(src, 
ref_left); 640 res_r1 = _mm_sad_epu8(src, ref_right); 641 res_r2 = _mm_sad_epu8(src, ref_top); 642 res_r3 = _mm_sad_epu8(src, ref_bot); 643 644 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 645 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 646 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 647 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 648 649 pu1_src += src_strd; 650 left_ptr += ref_strd; 651 right_ptr += ref_strd; 652 top_ptr += ref_strd; 653 bot_ptr += ref_strd; 654 655 // Row 2 sad calculation 656 src = _mm_loadu_si128((__m128i *) (pu1_src)); 657 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 658 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 659 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 660 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 661 662 res_r0 = _mm_sad_epu8(src, ref_left); 663 res_r1 = _mm_sad_epu8(src, ref_right); 664 res_r2 = _mm_sad_epu8(src, ref_top); 665 res_r3 = _mm_sad_epu8(src, ref_bot); 666 667 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 668 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 669 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 670 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 671 672 pu1_src += src_strd; 673 left_ptr += ref_strd; 674 right_ptr += ref_strd; 675 top_ptr += ref_strd; 676 bot_ptr += ref_strd; 677 678 // Row 3 sad calculation 679 src = _mm_loadu_si128((__m128i *) (pu1_src)); 680 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 681 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 682 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 683 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 684 685 res_r0 = _mm_sad_epu8(src, ref_left); 686 res_r1 = _mm_sad_epu8(src, ref_right); 687 res_r2 = _mm_sad_epu8(src, ref_top); 688 res_r3 = _mm_sad_epu8(src, ref_bot); 689 690 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 691 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 692 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 693 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 694 695 pu1_src += src_strd; 696 left_ptr += ref_strd; 697 right_ptr += ref_strd; 698 top_ptr += ref_strd; 
699 bot_ptr += ref_strd; 700 701 // Row 4 sad calculation 702 src = _mm_loadu_si128((__m128i *) (pu1_src)); 703 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 704 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 705 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 706 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 707 708 res_r0 = _mm_sad_epu8(src, ref_left); 709 res_r1 = _mm_sad_epu8(src, ref_right); 710 res_r2 = _mm_sad_epu8(src, ref_top); 711 res_r3 = _mm_sad_epu8(src, ref_bot); 712 713 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 714 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 715 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 716 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 717 718 pu1_src += src_strd; 719 left_ptr += ref_strd; 720 right_ptr += ref_strd; 721 top_ptr += ref_strd; 722 bot_ptr += ref_strd; 723 724 // Row 5 sad calculation 725 src = _mm_loadu_si128((__m128i *) (pu1_src)); 726 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 727 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 728 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 729 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 730 731 res_r0 = _mm_sad_epu8(src, ref_left); 732 res_r1 = _mm_sad_epu8(src, ref_right); 733 res_r2 = _mm_sad_epu8(src, ref_top); 734 res_r3 = _mm_sad_epu8(src, ref_bot); 735 736 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 737 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 738 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 739 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 740 741 pu1_src += src_strd; 742 left_ptr += ref_strd; 743 right_ptr += ref_strd; 744 top_ptr += ref_strd; 745 bot_ptr += ref_strd; 746 747 // Row 6 sad calculation 748 src = _mm_loadu_si128((__m128i *) (pu1_src)); 749 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 750 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 751 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 752 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 753 754 res_r0 = _mm_sad_epu8(src, ref_left); 755 res_r1 = _mm_sad_epu8(src, ref_right); 756 
res_r2 = _mm_sad_epu8(src, ref_top); 757 res_r3 = _mm_sad_epu8(src, ref_bot); 758 759 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 760 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 761 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 762 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 763 764 pu1_src += src_strd; 765 left_ptr += ref_strd; 766 right_ptr += ref_strd; 767 top_ptr += ref_strd; 768 bot_ptr += ref_strd; 769 770 // Row 7 sad calculation 771 src = _mm_loadu_si128((__m128i *) (pu1_src)); 772 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 773 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 774 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 775 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 776 777 res_r0 = _mm_sad_epu8(src, ref_left); 778 res_r1 = _mm_sad_epu8(src, ref_right); 779 res_r2 = _mm_sad_epu8(src, ref_top); 780 res_r3 = _mm_sad_epu8(src, ref_bot); 781 782 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 783 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 784 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 785 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 786 787 pu1_src += src_strd; 788 left_ptr += ref_strd; 789 right_ptr += ref_strd; 790 top_ptr += ref_strd; 791 bot_ptr += ref_strd; 792 793 // Row 8 sad calculation 794 src = _mm_loadu_si128((__m128i *) (pu1_src)); 795 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 796 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 797 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 798 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 799 800 res_r0 = _mm_sad_epu8(src, ref_left); 801 res_r1 = _mm_sad_epu8(src, ref_right); 802 res_r2 = _mm_sad_epu8(src, ref_top); 803 res_r3 = _mm_sad_epu8(src, ref_bot); 804 805 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 806 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 807 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 808 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 809 810 pu1_src += src_strd; 811 left_ptr += ref_strd; 812 right_ptr += ref_strd; 813 top_ptr += ref_strd; 814 bot_ptr += ref_strd; 815 816 // Row 9 sad calculation 
817 src = _mm_loadu_si128((__m128i *) (pu1_src)); 818 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 819 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 820 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 821 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 822 823 res_r0 = _mm_sad_epu8(src, ref_left); 824 res_r1 = _mm_sad_epu8(src, ref_right); 825 res_r2 = _mm_sad_epu8(src, ref_top); 826 res_r3 = _mm_sad_epu8(src, ref_bot); 827 828 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 829 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 830 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 831 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 832 833 pu1_src += src_strd; 834 left_ptr += ref_strd; 835 right_ptr += ref_strd; 836 top_ptr += ref_strd; 837 bot_ptr += ref_strd; 838 839 // Row 10 sad calculation 840 src = _mm_loadu_si128((__m128i *) (pu1_src)); 841 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 842 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 843 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 844 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 845 846 res_r0 = _mm_sad_epu8(src, ref_left); 847 res_r1 = _mm_sad_epu8(src, ref_right); 848 res_r2 = _mm_sad_epu8(src, ref_top); 849 res_r3 = _mm_sad_epu8(src, ref_bot); 850 851 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 852 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 853 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 854 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 855 856 pu1_src += src_strd; 857 left_ptr += ref_strd; 858 right_ptr += ref_strd; 859 top_ptr += ref_strd; 860 bot_ptr += ref_strd; 861 862 // Row 11 sad calculation 863 src = _mm_loadu_si128((__m128i *) (pu1_src)); 864 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 865 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 866 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 867 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 868 869 res_r0 = _mm_sad_epu8(src, ref_left); 870 res_r1 = _mm_sad_epu8(src, ref_right); 871 res_r2 = _mm_sad_epu8(src, ref_top); 872 res_r3 = 
_mm_sad_epu8(src, ref_bot); 873 874 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 875 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 876 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 877 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 878 879 pu1_src += src_strd; 880 left_ptr += ref_strd; 881 right_ptr += ref_strd; 882 top_ptr += ref_strd; 883 bot_ptr += ref_strd; 884 885 // Row 12 sad calculation 886 src = _mm_loadu_si128((__m128i *) (pu1_src)); 887 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 888 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 889 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 890 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 891 892 res_r0 = _mm_sad_epu8(src, ref_left); 893 res_r1 = _mm_sad_epu8(src, ref_right); 894 res_r2 = _mm_sad_epu8(src, ref_top); 895 res_r3 = _mm_sad_epu8(src, ref_bot); 896 897 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 898 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 899 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 900 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 901 902 pu1_src += src_strd; 903 left_ptr += ref_strd; 904 right_ptr += ref_strd; 905 top_ptr += ref_strd; 906 bot_ptr += ref_strd; 907 908 // Row 13 sad calculation 909 src = _mm_loadu_si128((__m128i *) (pu1_src)); 910 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 911 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 912 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 913 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 914 915 res_r0 = _mm_sad_epu8(src, ref_left); 916 res_r1 = _mm_sad_epu8(src, ref_right); 917 res_r2 = _mm_sad_epu8(src, ref_top); 918 res_r3 = _mm_sad_epu8(src, ref_bot); 919 920 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 921 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 922 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 923 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 924 925 pu1_src += src_strd; 926 left_ptr += ref_strd; 927 right_ptr += ref_strd; 928 top_ptr += ref_strd; 929 bot_ptr += ref_strd; 930 931 // Row 14 sad calculation 932 src = _mm_loadu_si128((__m128i *) 
(pu1_src)); 933 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 934 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 935 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 936 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 937 938 res_r0 = _mm_sad_epu8(src, ref_left); 939 res_r1 = _mm_sad_epu8(src, ref_right); 940 res_r2 = _mm_sad_epu8(src, ref_top); 941 res_r3 = _mm_sad_epu8(src, ref_bot); 942 943 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 944 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 945 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 946 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 947 948 pu1_src += src_strd; 949 left_ptr += ref_strd; 950 right_ptr += ref_strd; 951 top_ptr += ref_strd; 952 bot_ptr += ref_strd; 953 954 // Row 15 sad calculation 955 src = _mm_loadu_si128((__m128i *) (pu1_src)); 956 ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 957 ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 958 ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 959 ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 960 961 res_r0 = _mm_sad_epu8(src, ref_left); 962 res_r1 = _mm_sad_epu8(src, ref_right); 963 res_r2 = _mm_sad_epu8(src, ref_top); 964 res_r3 = _mm_sad_epu8(src, ref_bot); 965 966 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 967 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 968 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 969 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 970 971 val1 = _mm_extract_epi32(sad_r0, 0); 972 val2 = _mm_extract_epi32(sad_r0, 2); 973 pi4_sad[0] = (val1 + val2); 974 975 val1 = _mm_extract_epi32(sad_r1, 0); 976 val2 = _mm_extract_epi32(sad_r1, 2); 977 pi4_sad[1] = (val1 + val2); 978 979 val1 = _mm_extract_epi32(sad_r2, 0); 980 val2 = _mm_extract_epi32(sad_r2, 2); 981 pi4_sad[2] = (val1 + val2); 982 983 val1 = _mm_extract_epi32(sad_r3, 0); 984 val2 = _mm_extract_epi32(sad_r3, 2); 985 pi4_sad[3] = (val1 + val2); 986} 987 988/** 989****************************************************************************** 990* 991* @brief computes distortion (SAD) at all 
subpel points about the src location 992* 993* @par Description 994* This functions computes SAD at all points at a subpel distance from the 995* current source location. 996* 997* @param[in] pu1_src 998* UWORD8 pointer to the source 999* 1000* @param[out] pu1_ref_half_x 1001* UWORD8 pointer to half pel buffer 1002* 1003* @param[out] pu1_ref_half_y 1004* UWORD8 pointer to half pel buffer 1005* 1006* @param[out] pu1_ref_half_xy 1007* UWORD8 pointer to half pel buffer 1008* 1009* @param[in] src_strd 1010* integer source stride 1011* 1012* @param[in] ref_strd 1013* integer ref stride 1014* 1015* @param[out] pi4_sad 1016* integer evaluated sad 1017* pi4_sad[0] - half x 1018* pi4_sad[1] - half x - 1 1019* pi4_sad[2] - half y 1020* pi4_sad[3] - half y - 1 1021* pi4_sad[4] - half xy 1022* pi4_sad[5] - half xy - 1 1023* pi4_sad[6] - half xy - strd 1024* pi4_sad[7] - half xy - 1 - strd 1025* 1026* @remarks 1027* 1028****************************************************************************** 1029*/ 1030void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src, 1031 UWORD8 *pu1_ref_half_x, 1032 UWORD8 *pu1_ref_half_y, 1033 UWORD8 *pu1_ref_half_xy, 1034 WORD32 src_strd, 1035 WORD32 ref_strd, 1036 WORD32 *pi4_sad) 1037{ 1038 UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1; 1039 UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd; 1040 UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1; 1041 UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd; 1042 UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1; 1043 WORD32 val1, val2; 1044 1045 __m128i src, ref_half_x, ref_half_y, ref_half_xy; 1046 __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left; 1047 __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7; 1048 __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7; 1049 // Row 0 sad calculation 1050 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1051 ref_half_x = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1052 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1053 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1054 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1055 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1056 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1057 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1058 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1059 1060 sad_r0 = _mm_sad_epu8(src, ref_half_x); 1061 sad_r1 = _mm_sad_epu8(src, ref_half_x_left); 1062 sad_r2 = _mm_sad_epu8(src, ref_half_y); 1063 sad_r3 = _mm_sad_epu8(src, ref_half_y_top); 1064 sad_r4 = _mm_sad_epu8(src, ref_half_xy); 1065 sad_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1066 sad_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1067 sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1068 1069 pu1_src += src_strd; 1070 pu1_ref_half_x += ref_strd; 1071 pu1_ref_half_x_left += ref_strd; 1072 pu1_ref_half_y += ref_strd; 1073 pu1_ref_half_y_top += ref_strd; 1074 pu1_ref_half_xy += ref_strd; 1075 pu1_ref_half_xy_left += ref_strd; 1076 pu1_ref_half_xy_top += ref_strd; 1077 pu1_ref_half_xy_top_left += ref_strd; 1078 1079 // Row 1 sad calculation 1080 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1081 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1082 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1083 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1084 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1085 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1086 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1087 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1088 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1089 1090 res_r0 = _mm_sad_epu8(src, 
ref_half_x); 1091 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1092 res_r2 = _mm_sad_epu8(src, ref_half_y); 1093 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1094 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1095 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1096 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1097 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1098 1099 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1100 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1101 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1102 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1103 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1104 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1105 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1106 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1107 1108 pu1_src += src_strd; 1109 pu1_ref_half_x += ref_strd; 1110 pu1_ref_half_x_left += ref_strd; 1111 pu1_ref_half_y += ref_strd; 1112 pu1_ref_half_y_top += ref_strd; 1113 pu1_ref_half_xy += ref_strd; 1114 pu1_ref_half_xy_left += ref_strd; 1115 pu1_ref_half_xy_top += ref_strd; 1116 pu1_ref_half_xy_top_left += ref_strd; 1117 1118 // Row 2 sad calculation 1119 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1120 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1121 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1122 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1123 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1124 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1125 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1126 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1127 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1128 1129 res_r0 = _mm_sad_epu8(src, ref_half_x); 1130 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1131 res_r2 = _mm_sad_epu8(src, ref_half_y); 1132 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1133 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1134 res_r5 = _mm_sad_epu8(src, 
ref_half_xy_left); 1135 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1136 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1137 1138 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1139 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1140 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1141 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1142 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1143 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1144 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1145 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1146 1147 pu1_src += src_strd; 1148 pu1_ref_half_x += ref_strd; 1149 pu1_ref_half_x_left += ref_strd; 1150 pu1_ref_half_y += ref_strd; 1151 pu1_ref_half_y_top += ref_strd; 1152 pu1_ref_half_xy += ref_strd; 1153 pu1_ref_half_xy_left += ref_strd; 1154 pu1_ref_half_xy_top += ref_strd; 1155 pu1_ref_half_xy_top_left += ref_strd; 1156 1157 // Row 3 sad calculation 1158 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1159 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1160 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1161 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1162 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1163 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1164 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1165 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1166 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1167 1168 res_r0 = _mm_sad_epu8(src, ref_half_x); 1169 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1170 res_r2 = _mm_sad_epu8(src, ref_half_y); 1171 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1172 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1173 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1174 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1175 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1176 1177 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1178 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1179 sad_r2 = 
_mm_add_epi64(sad_r2, res_r2); 1180 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1181 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1182 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1183 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1184 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1185 1186 pu1_src += src_strd; 1187 pu1_ref_half_x += ref_strd; 1188 pu1_ref_half_x_left += ref_strd; 1189 pu1_ref_half_y += ref_strd; 1190 pu1_ref_half_y_top += ref_strd; 1191 pu1_ref_half_xy += ref_strd; 1192 pu1_ref_half_xy_left += ref_strd; 1193 pu1_ref_half_xy_top += ref_strd; 1194 pu1_ref_half_xy_top_left += ref_strd; 1195 1196 // Row 4 sad calculation 1197 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1198 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1199 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1200 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1201 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1202 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1203 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1204 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1205 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1206 1207 res_r0 = _mm_sad_epu8(src, ref_half_x); 1208 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1209 res_r2 = _mm_sad_epu8(src, ref_half_y); 1210 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1211 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1212 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1213 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1214 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1215 1216 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1217 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1218 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1219 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1220 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1221 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1222 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1223 sad_r7 = 
_mm_add_epi64(sad_r7, res_r7); 1224 1225 pu1_src += src_strd; 1226 pu1_ref_half_x += ref_strd; 1227 pu1_ref_half_x_left += ref_strd; 1228 pu1_ref_half_y += ref_strd; 1229 pu1_ref_half_y_top += ref_strd; 1230 pu1_ref_half_xy += ref_strd; 1231 pu1_ref_half_xy_left += ref_strd; 1232 pu1_ref_half_xy_top += ref_strd; 1233 pu1_ref_half_xy_top_left += ref_strd; 1234 1235 1236 // Row 5 sad calculation 1237 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1238 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1239 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1240 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1241 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1242 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1243 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1244 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1245 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1246 1247 res_r0 = _mm_sad_epu8(src, ref_half_x); 1248 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1249 res_r2 = _mm_sad_epu8(src, ref_half_y); 1250 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1251 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1252 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1253 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1254 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1255 1256 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1257 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1258 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1259 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1260 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1261 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1262 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1263 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1264 1265 pu1_src += src_strd; 1266 pu1_ref_half_x += ref_strd; 1267 pu1_ref_half_x_left += ref_strd; 1268 pu1_ref_half_y += ref_strd; 1269 pu1_ref_half_y_top += ref_strd; 1270 pu1_ref_half_xy += ref_strd; 
1271 pu1_ref_half_xy_left += ref_strd; 1272 pu1_ref_half_xy_top += ref_strd; 1273 pu1_ref_half_xy_top_left += ref_strd; 1274 1275 // Row 6 sad calculation 1276 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1277 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1278 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1279 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1280 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1281 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1282 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1283 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1284 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1285 1286 res_r0 = _mm_sad_epu8(src, ref_half_x); 1287 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1288 res_r2 = _mm_sad_epu8(src, ref_half_y); 1289 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1290 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1291 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1292 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1293 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1294 1295 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1296 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1297 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1298 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1299 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1300 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1301 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1302 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1303 1304 pu1_src += src_strd; 1305 pu1_ref_half_x += ref_strd; 1306 pu1_ref_half_x_left += ref_strd; 1307 pu1_ref_half_y += ref_strd; 1308 pu1_ref_half_y_top += ref_strd; 1309 pu1_ref_half_xy += ref_strd; 1310 pu1_ref_half_xy_left += ref_strd; 1311 pu1_ref_half_xy_top += ref_strd; 1312 pu1_ref_half_xy_top_left += ref_strd; 1313 1314 // Row 7 sad calculation 1315 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1316 ref_half_x = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1317 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1318 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1319 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1320 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1321 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1322 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1323 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1324 1325 res_r0 = _mm_sad_epu8(src, ref_half_x); 1326 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1327 res_r2 = _mm_sad_epu8(src, ref_half_y); 1328 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1329 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1330 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1331 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1332 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1333 1334 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1335 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1336 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1337 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1338 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1339 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1340 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1341 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1342 1343 pu1_src += src_strd; 1344 pu1_ref_half_x += ref_strd; 1345 pu1_ref_half_x_left += ref_strd; 1346 pu1_ref_half_y += ref_strd; 1347 pu1_ref_half_y_top += ref_strd; 1348 pu1_ref_half_xy += ref_strd; 1349 pu1_ref_half_xy_left += ref_strd; 1350 pu1_ref_half_xy_top += ref_strd; 1351 pu1_ref_half_xy_top_left += ref_strd; 1352 1353 // Row 8 sad calculation 1354 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1355 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1356 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1357 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1358 ref_half_x_left = _mm_loadu_si128((__m128i *) 
(pu1_ref_half_x_left)); 1359 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1360 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1361 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1362 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1363 1364 res_r0 = _mm_sad_epu8(src, ref_half_x); 1365 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1366 res_r2 = _mm_sad_epu8(src, ref_half_y); 1367 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1368 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1369 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1370 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1371 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1372 1373 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1374 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1375 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1376 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1377 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1378 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1379 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1380 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1381 1382 pu1_src += src_strd; 1383 pu1_ref_half_x += ref_strd; 1384 pu1_ref_half_x_left += ref_strd; 1385 pu1_ref_half_y += ref_strd; 1386 pu1_ref_half_y_top += ref_strd; 1387 pu1_ref_half_xy += ref_strd; 1388 pu1_ref_half_xy_left += ref_strd; 1389 pu1_ref_half_xy_top += ref_strd; 1390 pu1_ref_half_xy_top_left += ref_strd; 1391 1392 // Row 9 sad calculation 1393 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1394 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1395 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1396 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1397 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1398 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1399 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1400 ref_half_xy_top = _mm_loadu_si128((__m128i *) 
(pu1_ref_half_xy_top)); 1401 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1402 1403 res_r0 = _mm_sad_epu8(src, ref_half_x); 1404 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1405 res_r2 = _mm_sad_epu8(src, ref_half_y); 1406 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1407 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1408 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1409 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1410 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1411 1412 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1413 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1414 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1415 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1416 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1417 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1418 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1419 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1420 1421 pu1_src += src_strd; 1422 pu1_ref_half_x += ref_strd; 1423 pu1_ref_half_x_left += ref_strd; 1424 pu1_ref_half_y += ref_strd; 1425 pu1_ref_half_y_top += ref_strd; 1426 pu1_ref_half_xy += ref_strd; 1427 pu1_ref_half_xy_left += ref_strd; 1428 pu1_ref_half_xy_top += ref_strd; 1429 pu1_ref_half_xy_top_left += ref_strd; 1430 1431 // Row 10 sad calculation 1432 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1433 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1434 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1435 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1436 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1437 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1438 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1439 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1440 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1441 1442 res_r0 = _mm_sad_epu8(src, ref_half_x); 1443 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1444 res_r2 = _mm_sad_epu8(src, 
ref_half_y); 1445 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1446 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1447 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1448 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1449 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1450 1451 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1452 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1453 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1454 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1455 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1456 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1457 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1458 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1459 1460 pu1_src += src_strd; 1461 pu1_ref_half_x += ref_strd; 1462 pu1_ref_half_x_left += ref_strd; 1463 pu1_ref_half_y += ref_strd; 1464 pu1_ref_half_y_top += ref_strd; 1465 pu1_ref_half_xy += ref_strd; 1466 pu1_ref_half_xy_left += ref_strd; 1467 pu1_ref_half_xy_top += ref_strd; 1468 pu1_ref_half_xy_top_left += ref_strd; 1469 1470 // Row 11 sad calculation 1471 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1472 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1473 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1474 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1475 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1476 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1477 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1478 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1479 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1480 1481 res_r0 = _mm_sad_epu8(src, ref_half_x); 1482 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1483 res_r2 = _mm_sad_epu8(src, ref_half_y); 1484 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1485 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1486 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1487 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1488 res_r7 = _mm_sad_epu8(src, 
ref_half_xy_top_left); 1489 1490 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1491 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1492 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1493 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1494 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1495 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1496 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1497 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1498 1499 pu1_src += src_strd; 1500 pu1_ref_half_x += ref_strd; 1501 pu1_ref_half_x_left += ref_strd; 1502 pu1_ref_half_y += ref_strd; 1503 pu1_ref_half_y_top += ref_strd; 1504 pu1_ref_half_xy += ref_strd; 1505 pu1_ref_half_xy_left += ref_strd; 1506 pu1_ref_half_xy_top += ref_strd; 1507 pu1_ref_half_xy_top_left += ref_strd; 1508 1509 // Row 12 sad calculation 1510 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1511 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1512 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1513 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1514 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1515 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1516 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1517 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1518 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1519 1520 res_r0 = _mm_sad_epu8(src, ref_half_x); 1521 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1522 res_r2 = _mm_sad_epu8(src, ref_half_y); 1523 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1524 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1525 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1526 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1527 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1528 1529 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1530 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1531 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1532 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1533 sad_r4 = _mm_add_epi64(sad_r4, 
res_r4); 1534 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1535 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1536 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1537 1538 pu1_src += src_strd; 1539 pu1_ref_half_x += ref_strd; 1540 pu1_ref_half_x_left += ref_strd; 1541 pu1_ref_half_y += ref_strd; 1542 pu1_ref_half_y_top += ref_strd; 1543 pu1_ref_half_xy += ref_strd; 1544 pu1_ref_half_xy_left += ref_strd; 1545 pu1_ref_half_xy_top += ref_strd; 1546 pu1_ref_half_xy_top_left += ref_strd; 1547 1548 // Row 13 sad calculation 1549 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1550 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1551 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1552 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1553 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1554 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1555 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1556 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1557 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1558 1559 res_r0 = _mm_sad_epu8(src, ref_half_x); 1560 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1561 res_r2 = _mm_sad_epu8(src, ref_half_y); 1562 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1563 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1564 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1565 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1566 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1567 1568 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1569 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1570 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1571 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1572 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1573 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1574 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1575 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1576 1577 pu1_src += src_strd; 1578 pu1_ref_half_x += ref_strd; 1579 pu1_ref_half_x_left += 
ref_strd; 1580 pu1_ref_half_y += ref_strd; 1581 pu1_ref_half_y_top += ref_strd; 1582 pu1_ref_half_xy += ref_strd; 1583 pu1_ref_half_xy_left += ref_strd; 1584 pu1_ref_half_xy_top += ref_strd; 1585 pu1_ref_half_xy_top_left += ref_strd; 1586 1587 // Row 14 sad calculation 1588 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1589 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1590 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1591 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1592 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1593 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1594 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1595 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1596 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1597 1598 res_r0 = _mm_sad_epu8(src, ref_half_x); 1599 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1600 res_r2 = _mm_sad_epu8(src, ref_half_y); 1601 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1602 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1603 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1604 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1605 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1606 1607 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1608 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1609 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1610 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1611 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1612 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1613 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1614 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1615 1616 pu1_src += src_strd; 1617 pu1_ref_half_x += ref_strd; 1618 pu1_ref_half_x_left += ref_strd; 1619 pu1_ref_half_y += ref_strd; 1620 pu1_ref_half_y_top += ref_strd; 1621 pu1_ref_half_xy += ref_strd; 1622 pu1_ref_half_xy_left += ref_strd; 1623 pu1_ref_half_xy_top += ref_strd; 1624 pu1_ref_half_xy_top_left += ref_strd; 1625 1626 
// Row 15 sad calculation 1627 src = _mm_loadu_si128((__m128i *) (pu1_src)); 1628 ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 1629 ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 1630 ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 1631 ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 1632 ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 1633 ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 1634 ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 1635 ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 1636 1637 res_r0 = _mm_sad_epu8(src, ref_half_x); 1638 res_r1 = _mm_sad_epu8(src, ref_half_x_left); 1639 res_r2 = _mm_sad_epu8(src, ref_half_y); 1640 res_r3 = _mm_sad_epu8(src, ref_half_y_top); 1641 res_r4 = _mm_sad_epu8(src, ref_half_xy); 1642 res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 1643 res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 1644 res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 1645 1646 sad_r0 = _mm_add_epi64(sad_r0, res_r0); 1647 sad_r1 = _mm_add_epi64(sad_r1, res_r1); 1648 sad_r2 = _mm_add_epi64(sad_r2, res_r2); 1649 sad_r3 = _mm_add_epi64(sad_r3, res_r3); 1650 sad_r4 = _mm_add_epi64(sad_r4, res_r4); 1651 sad_r5 = _mm_add_epi64(sad_r5, res_r5); 1652 sad_r6 = _mm_add_epi64(sad_r6, res_r6); 1653 sad_r7 = _mm_add_epi64(sad_r7, res_r7); 1654 1655 val1 = _mm_extract_epi32(sad_r0, 0); 1656 val2 = _mm_extract_epi32(sad_r0, 2); 1657 pi4_sad[0] = (val1 + val2); 1658 1659 val1 = _mm_extract_epi32(sad_r1, 0); 1660 val2 = _mm_extract_epi32(sad_r1, 2); 1661 pi4_sad[1] = (val1 + val2); 1662 1663 val1 = _mm_extract_epi32(sad_r2, 0); 1664 val2 = _mm_extract_epi32(sad_r2, 2); 1665 pi4_sad[2] = (val1 + val2); 1666 1667 val1 = _mm_extract_epi32(sad_r3, 0); 1668 val2 = _mm_extract_epi32(sad_r3, 2); 1669 pi4_sad[3] = (val1 + val2); 1670 1671 val1 = _mm_extract_epi32(sad_r4, 0); 1672 val2 = _mm_extract_epi32(sad_r4, 2); 
    pi4_sad[4] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r5, 0);
    val2 = _mm_extract_epi32(sad_r5, 2);
    pi4_sad[5] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r6, 0);
    val2 = _mm_extract_epi32(sad_r6, 2);
    pi4_sad[6] = (val1 + val2);

    val1 = _mm_extract_epi32(sad_r7, 0);
    val2 = _mm_extract_epi32(sad_r7, 2);
    pi4_sad[7] = (val1 + val2);

    return;
}
/**
******************************************************************************
*
* @brief This function computes SAD between two 16x16 blocks. It also
*        computes whether the block will be zero after H264 transform and
*        quant for Intra 16x16 blocks
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (predicted) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimate stride
*
* @param[in] pu2_thrsh
*  Threshold for each element of transformed quantized block
*  (9 entries are read: pu2_thrsh[0..7] via one 128-bit load, plus
*  pu2_thrsh[8] as a scalar bound on the per-4x4-block SAD)
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad
*
* @param[out] pu4_is_zero
*  Pointer to store if the block is zero after transform and quantization
*
* @remarks
*
******************************************************************************
*/
void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
                                             UWORD8 *pu1_est,
                                             WORD32 src_strd,
                                             WORD32 est_strd,
                                             UWORD16 *pu2_thrsh,
                                             WORD32 *pi4_mb_distortion,
                                             UWORD32 *pu4_is_zero)
{
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i temp0, temp1, temp2, temp3, temp4;
    __m128i zero = _mm_setzero_si128(); // all bits reset to zero
    __m128i all_one = _mm_set1_epi8(0xFF);
    __m128i sad_b1, sad_b2, threshold;
    WORD16 sad_1, sad_2;
    WORD32 i;
    /* flag != 0 once any 4x4 block fails the "quantizes to zero" test;
       from then on only the SAD accumulation continues. */
    UWORD32 flag = 0;
    WORD32 test1, test2;
    threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
    (*pi4_mb_distortion) = 0;

    /* Each iteration handles a 4-row band of the 16x16 MB:
       first the left 8 columns (two 4x4 blocks), then the right 8. */
    for (i=0; i<4; i++)
    {
        /* ---- left half: 4 rows x 8 cols (two 4x4 blocks side by side) ---- */
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2

        /* widen 8 bytes -> 8 x 16-bit so the residuals below can go negative */
        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        /* absolute residual per pixel: |src - est| */
        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        /* fold symmetric rows together (outer pair, inner pair) */
        src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1); //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values

        /* per-4x4-block SADs (max 16*255 = 4080, fits in WORD16) */
        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            /* Zero-block test: build the "ls" terms of the SATQD check
               (2*sad minus combinations of the folded column sums) and
               compare each against the transform/quant thresholds.
               NOTE(review): appears to implement the H.264 zero-quant
               early-detection used by the scalar reference version of this
               routine — confirm against ime_compute_satqd_16x16_lumainter. */
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            /* test == 1 when every lane satisfied threshold > ls */
            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            /* any lane failing, or the block SAD reaching pu2_thrsh[8],
               means the block cannot be guaranteed zero after quant */
            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        /* ---- right half: same 4 rows, columns 8..15 ---- */
        pu1_src += 8;
        pu1_est += 8;

        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block3 and 4
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block3 and 4
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block3 and 4
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block3 and 4

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            /* same zero-block test for the two right-half 4x4 blocks */
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        /* advance to the next 4-row band, rewinding the +8 column offset */
        pu1_src += 4*src_strd - 8;
        pu1_est += 4*est_strd - 8;
    }

    /* flag == 0 => every 4x4 block passed the zero-quant test */
    *pu4_is_zero = flag;
}