ime_distortion_metrics_sse42.c revision 7497191460a9504f8b4f64df169ab633f0b74353
18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/****************************************************************************** 28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project 48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License"); 68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License. 78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at: 88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0 108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software 128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS, 138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and 158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License. 168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * 178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ***************************************************************************** 188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. 
Ltd, Bangalore 198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @file ime_distortion_metrics_sse42.c 248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief 268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This file contains definitions of routines that compute distortion 278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* between two macro/sub blocks of identical dimensions 288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @author 308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* Ittiam 318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par List of Functions: 338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_compute_sad_16x16_sse42() 348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_compute_sad_16x16_fast_sse42() 358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_compute_sad_16x16_ea8_sse42() 368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_compute_sad_16x8_sse42() 378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_calculate_sad4_prog_sse42() 388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_sub_pel_compute_sad_16x16_sse42() 398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* - ime_compute_satqd_16x16_lumainter_sse42() 408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* None 438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************* 458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* File Includes */ 498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */ 528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdio.h> 538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdlib.h> 548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <string.h> 558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */ 578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_typedefs.h" 588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_defs.h" 598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_macros.h" 608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_statistics.h" 618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_platform_macros.h" 628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_distortion_metrics.h" 638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <immintrin.h> 648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/ 668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Definitions */ 678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S/*****************************************************************************/ 688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x16 blocks 738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description 758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This functions computes SAD between 2 16x16 blocks. There is a provision 768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst 838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the destination 848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd 898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer destination stride 908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] i4_max_sad 928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer maximum allowed distortion 938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion 958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer evaluated sad 968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_sse42(UWORD8 *pu1_src, 1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est, 1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 est_strd, 
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_max_sad, 1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_mb_distortion) 1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i est_r0, est_r1, est_r2, est_r3; 1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3; 1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_val; 1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int val1, val2; 1137497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar UNUSED (i4_max_sad); 1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0-3 sad calculation 1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(res_r0, res_r1); 1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 4-7 sad calculation 1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 4*src_strd; 1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 4*est_strd; 1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 8-11 sad calculation 1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 4*src_strd; 1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 4*est_strd; 1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = 
_mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 12-15 sad calculation 1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 4*src_strd; 1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 4*est_strd; 1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_val,0); 2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_val, 2); 2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi4_mb_distortion = (val1+val2); 2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S return; 2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x8 blocks 2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description 2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This functions computes SAD between 2 16x8 blocks. There is a provision 2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst 2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the destination 2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd 2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer destination stride 2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] u4_max_sad 2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer maximum allowed distortion 2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion 
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer evaluated sad 2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x8_sse42(UWORD8 *pu1_src, 2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est, 2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 est_strd, 2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_max_sad, 2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_mb_distortion) 2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 2527497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar UNUSED (i4_max_sad); 2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i est_r0, est_r1, est_r2, est_r3; 2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3; 2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_val; 2578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S int val1, val2; 2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0-3 sad calculation 2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(res_r0, res_r1); 2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 4-7 sad calculation 2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 4*src_strd; 2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 4*est_strd; 2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd)); 
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd)); 2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd)); 2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd)); 2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_val,0); 3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_val, 2); 3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi4_mb_distortion = (val1+val2); 
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S return; 3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 3108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 3118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x16 blocks 3138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description 3158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This functions computes SAD between 2 16x16 blocks. There is a provision 3168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To 3178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* compute the distortion of the entire block set u4_max_sad to USHRT_MAX. 
3188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 3208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 3218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst 3238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the destination 3248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 3268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 3278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd 3298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer destination stride 3308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] i4_max_sad 3328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer maximum allowed distortion 3338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion 3358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer evaluated sad 3368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 3388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 3398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 3408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src, 3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est, 3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 est_strd, 3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_max_sad, 3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_mb_distortion) 3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i est_r0, est_r1, est_r2, est_r3; 3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3; 3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_val; 3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 val1, val2; 3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_sad; 3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_src_temp = pu1_src + src_strd; 3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est_temp = pu1_est + est_strd; 3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0,2,4,6 sad calculation 3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(res_r0, res_r1); 3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 8,10,12,14 sad calculation 3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 8*src_strd; 3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 8*est_strd; 3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 
2*est_strd)); 3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src = pu1_src_temp; 4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est = pu1_est_temp; 4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_val, 0); 4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_val, 2); 4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i4_sad = val1 + val2; 4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if (i4_max_sad < i4_sad) 4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi4_mb_distortion = i4_sad; 
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S return ; 4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 1,3,5,7 sad calculation 4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 9,11,13,15 sad calculation 4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 8*src_strd; 4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 8*est_strd; 4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd)); 4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd)); 4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd)); 4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd)); 4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd)); 4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd)); 4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_val, 0); 4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_val, 2); 4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi4_mb_distortion = (val1+val2); 4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S return; 4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode) 4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description 4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This functions computes SAD between 2 16x16 blocks by processing alternate 4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* rows (fast mode). For fast mode it is assumed sad obtained by processing 4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* alternate rows is approximately twice as that for the whole block. 
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst 4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the destination 4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd 4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer destination stride 4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] i4_max_sad 4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer maximum allowed distortion 4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion 4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer evaluated sad 4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src, 4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est, 4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 est_strd, 5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_max_sad, 5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_mb_distortion) 5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 5037497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar UNUSED (i4_max_sad); 5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i est_r0, est_r1, est_r2, est_r3; 5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3; 5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_val; 5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 val1, val2; 5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i4_sad; 5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_src_temp = pu1_src + src_strd; 5118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est_temp = pu1_est + est_strd; 5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0,2,4,6 sad calculation 5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); 5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd)); 5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); 5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i *) (pu1_est)); 5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); 
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); 5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); 5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(res_r0, res_r1); 5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 8,10,12,14 sad calculation 5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 8 * src_strd; 5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 8 * est_strd; 5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadu_si128((__m128i *) (pu1_src)); 5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd)); 5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd)); 5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd)); 5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadu_si128((__m128i 
*) (pu1_est)); 5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd)); 5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd)); 5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd)); 5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src_r0, est_r0); 5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src_r1, est_r1); 5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src_r2, est_r2); 5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src_r3, est_r3); 5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r0); 5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r1); 5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r2); 5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_val = _mm_add_epi64(sad_val, res_r3); 5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src = pu1_src_temp; 5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est = pu1_est_temp; 5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_val, 0); 5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_val, 2); 5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S i4_sad = val1 + val2; 5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pi4_mb_distortion = (i4_sad<<1); 5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
return; 5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 5688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 5698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************* 5708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief compute sad 5728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description: This function computes the sad at vertices of diamond grid 5748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* centered at reference pointer and at unit distance from it. 5758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref 5778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the reference 5788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_src 5808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 5818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] ref_strd 5838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer reference stride 5848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 5868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 5878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_sad 5898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pointer to integer array evaluated sad 5908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @returns sad at all 
evaluated vertexes 5928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks none 5948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 5958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************* 5968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref, 5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_src, 5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 ref_strd, 6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_sad) 6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */ 6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *left_ptr = pu1_ref - 1; 6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *right_ptr = pu1_ref + 1; 6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *top_ptr = pu1_ref - ref_strd; 6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *bot_ptr = pu1_ref + ref_strd; 6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 val1, val2; 6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src, ref_left, ref_right, ref_top, ref_bot; 6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3; 6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_r0, sad_r1, sad_r2, sad_r3; 6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0 sad calculation 6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = 
_mm_loadu_si128((__m128i *) (pu1_src)); 6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_sad_epu8(src, ref_left); 6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_sad_epu8(src, ref_right); 6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_sad_epu8(src, ref_top); 6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_sad_epu8(src, ref_bot); 6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 1 sad calculation 6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i 
*) (bot_ptr)); 6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 2 sad calculation 6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 3 sad calculation 6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 4 sad calculation 7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 5 sad calculation 7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 6 sad calculation 7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 7 sad calculation 7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 8 sad calculation 7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 9 sad calculation 8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 10 sad calculation 8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 8598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 11 sad calculation 8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 8738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 8758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 8768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 8778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 8788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 8808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 8818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 8828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 8838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 8848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 12 sad calculation 8868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 8878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 8888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 8898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 8908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
8918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 8938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 8948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 8958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 9008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 13 sad calculation 9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 9118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 9128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 14 sad calculation 9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S left_ptr += ref_strd; 9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S right_ptr += ref_strd; 9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S top_ptr += ref_strd; 9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S bot_ptr += ref_strd; 9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 15 sad calculation 9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_left = _mm_loadu_si128((__m128i *) (left_ptr)); 9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_right = _mm_loadu_si128((__m128i *) (right_ptr)); 9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_top = _mm_loadu_si128((__m128i *) (top_ptr)); 9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr)); 
9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_left); 9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_right); 9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_top); 9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_bot); 9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r0, 0); 9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r0, 2); 9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[0] = (val1 + val2); 9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r1, 0); 9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r1, 2); 9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[1] = (val1 + val2); 9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r2, 0); 9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r2, 2); 9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[2] = (val1 + val2); 9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r3, 0); 
9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r3, 2); 9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[3] = (val1 + val2); 9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/** 9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) at all subpel points about the src location 9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 9938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description 9948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* This function computes SAD at all points at a subpel distance from the 9958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* current source location. 
9968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 9978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 9988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 9998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_x 10018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to half pel buffer 10028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_y 10048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to half pel buffer 10058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_xy 10078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to half pel buffer 10088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 10108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 10118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] ref_strd 10138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer ref stride 10148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_sad 10168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer array of evaluated sads 10178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[0] - half x 10188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[1] - half x - 1 10198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[2] - half y 10208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[3] - half y - strd 10218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[4] - half xy 
10228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[5] - half xy - 1 10238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[6] - half xy - strd 10248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* pi4_sad[7] - half xy - 1 - strd 10258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 10278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 10288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 10298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 10308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src, 10318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_x, 10328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_y, 10338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_xy, 10348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 10358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 ref_strd, 10368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_sad) 10378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 10388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1; 10398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd; 10408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1; 10418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd; 10428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1; 10438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 val1, val2; 10448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
10458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src, ref_half_x, ref_half_y, ref_half_xy; 10468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left; 10478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7; 10488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7; 10498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 0 sad calculation 10508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 10518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 10528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 10538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 10548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 10558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 10568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 10578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 10588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 10598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_sad_epu8(src, ref_half_x); 10618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_sad_epu8(src, ref_half_x_left); 
10628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_sad_epu8(src, ref_half_y); 10638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_sad_epu8(src, ref_half_y_top); 10648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_sad_epu8(src, ref_half_xy); 10658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_sad_epu8(src, ref_half_xy_left); 10668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_sad_epu8(src, ref_half_xy_top); 10678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 10688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 10708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 10718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 10728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 10738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 10748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 10758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 10768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 10778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 10788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 1 sad calculation 10808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 10818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 10828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 10838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy 
= _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 10848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 10858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 10868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 10878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 10888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 10898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 10918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 10928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 10938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 10948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 10958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 10968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 10978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 10988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 10998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 11008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 11018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 11028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, 
res_r3); 11038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 11048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 11058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 11068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 11078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 11098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 11108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 11118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 11128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 11138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 11148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 11158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 11168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 11178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 2 sad calculation 11198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 11208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 11218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 11228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 11238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 
11248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 11258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 11268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 11278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 11288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 11308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 11318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 11328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 11338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 11348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 11358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 11368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 11378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 11398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 11408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 11418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 11428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 11438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = 
_mm_add_epi64(sad_r5, res_r5); 11448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 11458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 11468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 11488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 11498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 11508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 11518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 11528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 11538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 11548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 11558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 11568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 3 sad calculation 11588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 11598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 11608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 11618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 11628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 11638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 11648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 11658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 11668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 11678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 11698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 11708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 11718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 11728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 11738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 11748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 11758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 11768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 11788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 11798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 11808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 11818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 11828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 11838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 11848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = 
_mm_add_epi64(sad_r7, res_r7); 11858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 11878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 11888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 11898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 11908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 11918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 11928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 11938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 11948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 11958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 11968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 4 sad calculation 11978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 11988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 11998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 12008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 12018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 12028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 12038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 12048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 
12058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 12068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 12088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 12098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 12108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 12118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 12128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 12138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 12148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 12158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 12178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 12188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 12198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 12208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 12218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 12228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 12238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 12248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 
12268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 12278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 12288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 12298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 12308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 12318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 12328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 12338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 12348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 5 sad calculation 12378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 12388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 12398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 12408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 12418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 12428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 12438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 12448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 12458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) 
(pu1_ref_half_xy_top_left)); 12468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 12488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 12498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 12508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 12518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 12528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 12538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 12548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 12558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 12578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 12588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 12598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 12608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 12618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 12628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 12638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 12648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 12668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 
12678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 12688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 12698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 12708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 12718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 12728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 12738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 12748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 6 sad calculation 12768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 12778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 12788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 12798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 12808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 12818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 12828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 12838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 12848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 12858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = 
_mm_sad_epu8(src, ref_half_x); 12878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 12888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 12898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 12908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 12918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 12928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 12938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 12948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 12958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 12968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 12978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 12988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 12998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 13008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 13018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 13028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 13038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 13058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 13068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 13078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 
13088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 13098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 13108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 13118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 13128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 13138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 7 sad calculation 13158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 13168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 13178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 13188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 13198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 13208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 13218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 13228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 13238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 13248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 13268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 
13278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 13288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 13298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 13308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 13318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 13328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 13338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 13358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 13368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 13378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 13388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 13398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 13408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 13418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 13428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 13448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 13458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 13468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 13478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 13488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy 
+= ref_strd; 13498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 13508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 13518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 13528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 8 sad calculation 13548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 13558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 13568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 13578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 13588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 13598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 13608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 13618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 13628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 13638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 13658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 13668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 13678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 
13688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 13698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 13708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 13718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 13728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 13748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 13758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 13768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 13778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 13788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 13798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 13808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 13818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 13838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 13848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 13858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 13868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 13878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 13888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 13898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 
13908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 13918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 13928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 9 sad calculation 13938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 13948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 13958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 13968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 13978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 13988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 13998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 14008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 14018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 14028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 14048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 14058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 14068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 14078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 14088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 
14098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 14108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 14118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 14138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 14148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 14158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 14168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 14178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 14188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 14198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 14208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 14228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 14238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 14248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 14258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 14268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 14278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 14288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 14298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 14308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha 
S // Row 10 sad calculation 14328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 14338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 14348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 14358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 14368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 14378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 14388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 14398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 14408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 14418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 14438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 14448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 14458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 14468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 14478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 14488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 14498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 
14508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 14528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 14538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 14548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 14558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 14568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 14578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 14588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 14598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 14618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 14628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 14638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 14648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 14658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 14668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 14678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 14688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 14698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 11 sad calculation 14718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 14728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 14738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 14748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 14758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 14768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 14778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 14788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 14798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 14808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 14828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 14838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 14848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 14858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 14868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 14878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 14888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 14898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 
14918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 14928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 14938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 14948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 14958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 14968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 14978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 14988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 14998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 15008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 15018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 15028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 15038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 15048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 15058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 15068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 15078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 15088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 12 sad calculation 15108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 15118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 15128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) 
(pu1_ref_half_y)); 15138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 15148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 15158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 15168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 15178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 15188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 15198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 15218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 15228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 15238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 15248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 15258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 15268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 15278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 15288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 15308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 15318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 
15328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 15338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 15348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 15358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 15368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 15378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 15398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 15408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 15418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 15428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 15438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 15448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 15458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 15468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 15478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 13 sad calculation 15498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 15508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 15518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 15528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 15538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left 
= _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 15548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 15558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 15568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 15578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 15588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 15608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 15618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 15628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 15638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 15648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 15658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 15668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 15678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 15698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 15708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 15718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 15728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 
15738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 15748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 15758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 15768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 15788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 15798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 15808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 15818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 15828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 15838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 15848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 15858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 15868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 14 sad calculation 15888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 15898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 15908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 15918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 15928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 15938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 
15948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 15958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 15968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 15978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 15988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 15998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 16008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 16018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 16028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 16038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 16048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 16058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 16068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 16088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 16098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 16108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 16118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 16128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 16138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 
16148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 16158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += src_strd; 16178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x += ref_strd; 16188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_x_left += ref_strd; 16198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y += ref_strd; 16208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_y_top += ref_strd; 16218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy += ref_strd; 16228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_left += ref_strd; 16238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top += ref_strd; 16248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_ref_half_xy_top_left += ref_strd; 16258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S // Row 15 sad calculation 16278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src = _mm_loadu_si128((__m128i *) (pu1_src)); 16288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x)); 16298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y)); 16308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy)); 16318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left)); 16328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top)); 16338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left)); 16348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top = 
_mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top)); 16358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left)); 16368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r0 = _mm_sad_epu8(src, ref_half_x); 16388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r1 = _mm_sad_epu8(src, ref_half_x_left); 16398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r2 = _mm_sad_epu8(src, ref_half_y); 16408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r3 = _mm_sad_epu8(src, ref_half_y_top); 16418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r4 = _mm_sad_epu8(src, ref_half_xy); 16428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r5 = _mm_sad_epu8(src, ref_half_xy_left); 16438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r6 = _mm_sad_epu8(src, ref_half_xy_top); 16448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left); 16458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r0 = _mm_add_epi64(sad_r0, res_r0); 16478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r1 = _mm_add_epi64(sad_r1, res_r1); 16488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r2 = _mm_add_epi64(sad_r2, res_r2); 16498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r3 = _mm_add_epi64(sad_r3, res_r3); 16508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r4 = _mm_add_epi64(sad_r4, res_r4); 16518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r5 = _mm_add_epi64(sad_r5, res_r5); 16528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r6 = _mm_add_epi64(sad_r6, res_r6); 16538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_r7 = _mm_add_epi64(sad_r7, res_r7); 16548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
val1 = _mm_extract_epi32(sad_r0, 0); 16568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r0, 2); 16578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[0] = (val1 + val2); 16588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r1, 0); 16608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r1, 2); 16618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[1] = (val1 + val2); 16628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r2, 0); 16648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r2, 2); 16658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[2] = (val1 + val2); 16668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r3, 0); 16688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r3, 2); 16698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[3] = (val1 + val2); 16708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r4, 0); 16728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r4, 2); 16738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[4] = (val1 + val2); 16748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r5, 0); 16768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r5, 2); 16778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[5] = (val1 + val2); 16788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = 
_mm_extract_epi32(sad_r6, 0); 16808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r6, 2); 16818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[6] = (val1 + val2); 16828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val1 = _mm_extract_epi32(sad_r7, 0); 16848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S val2 = _mm_extract_epi32(sad_r7, 2); 16858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pi4_sad[7] = (val1 + val2); 16868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 16878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S return; 16888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 16898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* 16908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 16918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief This function computes SAD between two 16x16 blocks 16928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* It also computes if the block will be zero after H264 transform and quant for 16938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* Intra 16x16 blocks 16948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 16958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src 16968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the source 16978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 16988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_est 16998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* UWORD8 pointer to the estimated block 17008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd 17028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer source stride 17038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] est_strd
17058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer destination stride 17068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu2_thrsh 17088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* Threshold for each element of transformed quantized block 17098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion 17118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* integer evaluated sad 17128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu4_is_zero 17148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* Pointer to store if the block is zero after transform and quantization 17158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks 17178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* 17188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S****************************************************************************** 17198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/ 17208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src, 17218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD8 *pu1_est, 17228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 src_strd, 17238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 est_strd, 17248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD16 *pu2_thrsh, 17258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 *pi4_mb_distortion, 17268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 *pu4_is_zero) 17278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{ 17288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i src_r0, src_r1, src_r2, src_r3; 17298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha
S __m128i est_r0, est_r1, est_r2, est_r3; 17308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i temp0, temp1, temp2, temp3, temp4; 17318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i zero = _mm_setzero_si128(); // all bits reset to zero 17328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i all_one = _mm_set1_epi8(0xFF); 17338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S __m128i sad_b1, sad_b2, threshold; 17348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD16 sad_1, sad_2; 17358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 i; 17368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S UWORD32 flag = 0; 17378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S WORD32 test1, test2; 17388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S threshold = _mm_loadu_si128((__m128i *) pu2_thrsh); 17398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S (*pi4_mb_distortion) = 0; 17408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S for (i=0; i<4; i++) 17428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S { 17438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block1 and 2 17448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2 17458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2 17468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2 17478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_cvtepu8_epi16(src_r0); 17498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_cvtepu8_epi16(src_r1); 17508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = 
_mm_cvtepu8_epi16(src_r2); 17518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_cvtepu8_epi16(src_r3); 17528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadl_epi64((__m128i *) pu1_est); 17548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd)); 17558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd)); 17568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd)); 17578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_cvtepu8_epi16(est_r0); 17598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_cvtepu8_epi16(est_r1); 17608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_cvtepu8_epi16(est_r2); 17618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_cvtepu8_epi16(est_r3); 17628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_sub_epi16(src_r0, est_r0); 17648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_sub_epi16(src_r1, est_r1); 17658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(src_r2, est_r2); 17668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(src_r3, est_r3); 17678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_abs_epi16(src_r0); 17698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_abs_epi16(src_r1); 17708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_abs_epi16(src_r2); 17718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_abs_epi16(src_r3); 17728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
17738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1 17748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2 17758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S //SAD calculation 17778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r1); //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2 17788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(temp0, zero); 17798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values 17808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_1 = _mm_extract_epi16(temp0, 0); 17828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_2 = _mm_extract_epi16(temp0, 1); 17838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S (*pi4_mb_distortion) += sad_1 + sad_2; 17858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if (flag == 0) { 17878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b1 = _mm_set1_epi16((sad_1 << 1)); 17888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b2 = _mm_set1_epi16((sad_2 << 1)); 17898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1 17918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4 17928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2 
17948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3 17958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0 17978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0 17988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 17998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0 18008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0 18018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0 18038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0 18048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0 18068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0 18078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0 18098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0 18118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0 18128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
18138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0 18148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) 18168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1) 18178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) 18198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1) 18208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0 18228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1 18238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff 18258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(threshold, sad_b2); 18278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation 18298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_xor_si128(temp1, all_one); 18308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
test1 = _mm_test_all_zeros(temp0, all_one); 18328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S test2 = _mm_test_all_zeros(temp1, all_one); 18338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1 18358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S || pu2_thrsh[8] <= sad_2) 18368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = 1; 18378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 18388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 8; 18408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 8; 18418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_loadl_epi64((__m128i *) pu1_src); //Row 0 - Block1 and 2 18438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2 18448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2 18458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2 18468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_cvtepu8_epi16(src_r0); 18488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_cvtepu8_epi16(src_r1); 18498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_cvtepu8_epi16(src_r2); 18508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_cvtepu8_epi16(src_r3); 18518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_loadl_epi64((__m128i *) pu1_est); 18538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = 
_mm_loadl_epi64((__m128i *) (pu1_est + est_strd)); 18548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd)); 18558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd)); 18568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r0 = _mm_cvtepu8_epi16(est_r0); 18588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r1 = _mm_cvtepu8_epi16(est_r1); 18598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r2 = _mm_cvtepu8_epi16(est_r2); 18608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S est_r3 = _mm_cvtepu8_epi16(est_r3); 18618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_sub_epi16(src_r0, est_r0); 18638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_sub_epi16(src_r1, est_r1); 18648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_sub_epi16(src_r2, est_r2); 18658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_sub_epi16(src_r3, est_r3); 18668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_abs_epi16(src_r0); 18688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_abs_epi16(src_r1); 18698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r2 = _mm_abs_epi16(src_r2); 18708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r3 = _mm_abs_epi16(src_r3); 18718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_add_epi16(src_r0, src_r3); //s1 s4 s4 s1 a1 a4 a4 a1 18738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_add_epi16(src_r1, src_r2); //s2 s3 s3 s2 a2 a3 a3 a2 18748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 
//SAD calculation 18768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_add_epi16(src_r0, src_r1); 18778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(temp0, zero); 18788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(temp0, zero); //sad1, sad2 - 16bit values 18798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_1 = _mm_extract_epi16(temp0, 0); 18818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_2 = _mm_extract_epi16(temp0, 1); 18828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S (*pi4_mb_distortion) += sad_1 + sad_2; 18848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if (flag == 0) { 18868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b1 = _mm_set1_epi16((sad_1 << 1)); 18878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b2 = _mm_set1_epi16((sad_2 << 1)); 18888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1 18908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4 18918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2 18938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3 18948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r0 = _mm_hadd_epi16(src_r0, zero); //s1 s4 a1 a4 0 0 0 0 18968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S src_r1 = _mm_hadd_epi16(src_r1, zero); //s2 s3 a2 a3 0 0 0 0 
18978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 18988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0 18998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0 19008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0 19028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0 19038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0 19058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0 19068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0 19088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_hadd_epi16(src_r0, zero); //s1+s4 a1+a4 0 0 0 0 0 0 19108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_hadd_epi16(src_r1, zero); //s2+s3 a2+a3 0 0 0 0 0 0 19118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0 19138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) 19158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 
(s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1) 19168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) 19188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1) 19198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b1 = _mm_sub_epi16(sad_b1, temp2); //lsi values Block0 19218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S sad_b2 = _mm_sub_epi16(sad_b2, temp3); //lsi values Block1 19228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff 19248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_cmpgt_epi16(threshold, sad_b2); 19268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp0 = _mm_xor_si128(temp0, all_one); //Xor with 1 => NOT operation 19288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S temp1 = _mm_xor_si128(temp1, all_one); 19298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S test1 = _mm_test_all_zeros(temp0, all_one); 19318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S test2 = _mm_test_all_zeros(temp1, all_one); 19328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1 19348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S || pu2_thrsh[8] <= sad_2) 19358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S flag = 1; 
19368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 19378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_src += 4*src_strd - 8; 19398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S pu1_est += 4*est_strd - 8; 19408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S } 19418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S 19428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *pu4_is_zero = flag; 19438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S} 1944