18d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/******************************************************************************
28d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
38d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Copyright (C) 2015 The Android Open Source Project
48d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
58d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Licensed under the Apache License, Version 2.0 (the "License");
68d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * you may not use this file except in compliance with the License.
78d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * You may obtain a copy of the License at:
88d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
98d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * http://www.apache.org/licenses/LICENSE-2.0
108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Unless required by applicable law or agreed to in writing, software
128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * distributed under the License is distributed on an "AS IS" BASIS,
138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * See the License for the specific language governing permissions and
158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * limitations under the License.
168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *
178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S *****************************************************************************
188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @file ime_distortion_metrics_sse42.c
248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief
268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  This file contains definitions of routines that compute distortion
278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  between two macro/sub blocks of identical dimensions
288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @author
308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  Ittiam
318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par List of Functions:
338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_compute_sad_16x16_sse42()
348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_compute_sad_16x16_fast_sse42()
358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_compute_sad_16x16_ea8_sse42()
368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_compute_sad_16x8_sse42()
378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_calculate_sad4_prog_sse42()
388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_sub_pel_compute_sad_16x16_sse42()
398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  - ime_compute_satqd_16x16_lumainter_sse42()
408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  None
438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*******************************************************************************
458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* File Includes                                                             */
498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* System include files */
528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdio.h>
538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <stdlib.h>
548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <string.h>
558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* User include files */
578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_typedefs.h"
588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_defs.h"
598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_macros.h"
608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_statistics.h"
618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_platform_macros.h"
628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include "ime_distortion_metrics.h"
638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S#include <immintrin.h>
648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/* Function Definitions                                                      */
678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*****************************************************************************/
688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x16 blocks
738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par   Description
758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   This functions computes SAD between 2 16x16 blocks. There is a provision
768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src
808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the source
818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst
838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the destination
848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd
868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer source stride
878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd
898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer destination stride
908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] i4_max_sad
928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer maximum allowed distortion
938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion
958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer evaluated sad
968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
1008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
1018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
1028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           UWORD8 *pu1_est,
1038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           WORD32 src_strd,
1048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           WORD32 est_strd,
1058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           WORD32 i4_max_sad,
1068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                           WORD32 *pi4_mb_distortion)
1078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
1088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
1098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i est_r0, est_r1, est_r2, est_r3;
1108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3;
1118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_val;
1128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    int val1, val2;
1137497191460a9504f8b4f64df169ab633f0b74353Harish Mahendrakar    UNUSED (i4_max_sad);
1148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0-3 sad calculation
1168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
1178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
1188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
1198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
1208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
1228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
1238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
1248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
1258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
1278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
1288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
1298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
1308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(res_r0, res_r1);
1328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
1338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
1348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 4-7 sad calculation
1368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 4*src_strd;
1378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 4*est_strd;
1388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
1408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
1418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
1428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
1438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
1458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
1468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
1478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
1488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
1508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
1518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
1528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
1538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
1558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
1568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
1578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
1588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 8-11 sad calculation
1608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 4*src_strd;
1618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 4*est_strd;
1628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
1638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
1648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
1658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
1668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
1688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
1698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
1708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
1718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
1738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
1748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
1758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
1768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
1788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
1798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
1808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
1818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 12-15 sad calculation
1838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 4*src_strd;
1848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 4*est_strd;
1858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
1868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
1878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
1888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
1898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
1918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
1928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
1938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
1948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
1958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
1968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
1978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
1988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
1998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
2018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
2028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
2038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
2048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_val,0);
2068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_val, 2);
2078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi4_mb_distortion = (val1+val2);
2088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    return;
2108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
2118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
2138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
2148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  @brief computes distortion (SAD) between 2 16x8  blocks
2168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  @par   Description
2198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   This functions computes SAD between 2 16x8 blocks. There is a provision
2208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
2218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
2228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src
2248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the source
2258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst
2278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the destination
2288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd
2308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer source stride
2318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd
2338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer destination stride
2348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] u4_max_sad
2368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer maximum allowed distortion
2378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion
2398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer evaluated sad
2408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
2428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
2438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
2448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
2458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
2468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    UWORD8 *pu1_est,
2478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    WORD32 src_strd,
2488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    WORD32 est_strd,
2498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    WORD32 i4_max_sad,
2508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    WORD32 *pi4_mb_distortion)
2518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
2528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
2538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i est_r0, est_r1, est_r2, est_r3;
2548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3;
2558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_val;
2568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    int val1, val2;
257086dd8ea90eaf4ead8b4927e777b1c8a19bd23a9Martin Storsjo    UNUSED (i4_max_sad);
2588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0-3 sad calculation
2608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
2618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
2628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
2638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
2648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
2668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
2678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
2688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
2698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
2718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
2728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
2738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
2748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(res_r0, res_r1);
2768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
2778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
2788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 4-7 sad calculation
2808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 4*src_strd;
2818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 4*est_strd;
2828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
2848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
2858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
2868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
2878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
2898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
2908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
2918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
2928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
2948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
2958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
2968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
2978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
2988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
2998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
3008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
3018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
3028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_val,0);
3048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_val, 2);
3058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi4_mb_distortion = (val1+val2);
3068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    return;
3078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
3088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
/**
******************************************************************************
*
* @brief computes distortion (SAD) between 2 16x16 blocks
*
* @par   Description
*   This function computes SAD between 2 16x16 blocks. There is a provision
*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
*   compute the distortion of the entire block set i4_max_sad to USHRT_MAX.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer estimated block stride
*
* @param[in] i4_max_sad
*  integer maximum allowed distortion
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad
*
* @remarks
*
******************************************************************************
*/
3418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
3428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                               UWORD8 *pu1_est,
3438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                               WORD32 src_strd,
3448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                               WORD32 est_strd,
3458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                               WORD32 i4_max_sad,
3468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                               WORD32 *pi4_mb_distortion)
3478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
3488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
3498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i est_r0, est_r1, est_r2, est_r3;
3508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3;
3518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_val;
3528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 val1, val2;
3538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 i4_sad;
3548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_src_temp = pu1_src + src_strd;
3558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_est_temp = pu1_est + est_strd;
3568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0,2,4,6 sad calculation
3588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
3598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
3608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
3618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
3628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
3648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
3658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
3668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
3678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
3698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
3708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
3718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
3728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(res_r0, res_r1);
3748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
3758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
3768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 8,10,12,14 sad calculation
3788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 8*src_strd;
3798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 8*est_strd;
3808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
3828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
3838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
3848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
3858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
3878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
3888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
3898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
3908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
3928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
3938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
3948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
3958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
3968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
3978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
3988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
3998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
4008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src = pu1_src_temp;
4028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est = pu1_est_temp;
4038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_val, 0);
4058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_val, 2);
4068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i4_sad = val1 + val2;
4088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    if (i4_max_sad < i4_sad)
4098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
4108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        *pi4_mb_distortion = i4_sad;
4118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        return ;
4128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
4138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 1,3,5,7 sad calculation
4148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
4158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
4168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
4178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
4188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
4208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
4218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
4228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
4238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
4258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
4268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
4278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
4288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
4308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
4318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
4328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
4338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 9,11,13,15 sad calculation
4358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 8*src_strd;
4368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 8*est_strd;
4378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
4388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
4398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
4408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
4418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
4438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
4448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
4458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
4468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
4488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
4498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
4508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
4518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
4538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
4548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
4558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
4568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_val, 0);
4588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_val, 2);
4598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi4_mb_distortion = (val1+val2);
4608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    return;
4628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
4638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
4648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
4658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
4668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
4688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par   Description
4708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   This functions computes SAD between 2 16x16 blocks by processing alternate
4718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   rows (fast mode). For fast mode it is assumed sad obtained by processing
4728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   alternate rows is approximately twice as that for the whole block.
4738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src
4758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the source
4768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu1_dst
4788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the destination
4798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd
4818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer source stride
4828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] dst_strd
4848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer destination stride
4858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] i4_max_sad
4878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer maximum allowed distortion
4888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion
4908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer evaluated sad
4918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
4938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
4948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
4958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
4968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
4978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                UWORD8 *pu1_est,
4988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                WORD32 src_strd,
4998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                WORD32 est_strd,
5008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                WORD32 i4_max_sad,
5018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                WORD32 *pi4_mb_distortion)
5028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
5038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
5048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i est_r0, est_r1, est_r2, est_r3;
5058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3;
5068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_val;
5078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 val1, val2;
5088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 i4_sad;
5098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_src_temp = pu1_src + src_strd;
5108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_est_temp = pu1_est + est_strd;
511086dd8ea90eaf4ead8b4927e777b1c8a19bd23a9Martin Storsjo    UNUSED (i4_max_sad);
5128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0,2,4,6 sad calculation
5148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
5158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
5168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
5178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
5188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
5208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
5218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
5228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
5238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
5258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
5268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
5278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
5288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(res_r0, res_r1);
5308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
5318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
5328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 8,10,12,14 sad calculation
5348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += 8 * src_strd;
5358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est += 8 * est_strd;
5368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
5388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
5398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
5408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
5418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
5438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
5448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
5458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
5468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src_r0, est_r0);
5488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src_r1, est_r1);
5498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src_r2, est_r2);
5508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src_r3, est_r3);
5518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r0);
5538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r1);
5548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r2);
5558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_val = _mm_add_epi64(sad_val, res_r3);
5568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src = pu1_src_temp;
5588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_est = pu1_est_temp;
5598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_val, 0);
5618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_val, 2);
5628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
5638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    i4_sad = val1 + val2;
5648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    *pi4_mb_distortion = (i4_sad<<1);
5658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    return;
5668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
5678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
/**
*******************************************************************************
*
* @brief compute sad
*
* @par Description: This function computes the sad at the vertices of a diamond
* grid centered at the reference pointer and at unit distance from it.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] ref_strd
*  integer reference stride
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pi4_sad
*  pointer to integer array evaluated sad
*
* @returns  sad at all evaluated vertices
*
* @remarks  none
*
*******************************************************************************
*/
5978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
5988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             UWORD8 *pu1_src,
5998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             WORD32 ref_strd,
6008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             WORD32 src_strd,
6018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                             WORD32 *pi4_sad)
6028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
6038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
6048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *left_ptr    = pu1_ref - 1;
6058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *right_ptr   = pu1_ref + 1;
6068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *top_ptr     = pu1_ref - ref_strd;
6078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *bot_ptr     = pu1_ref + ref_strd;
6088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 val1, val2;
6108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src, ref_left, ref_right, ref_top, ref_bot;
6118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3;
6128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_r0, sad_r1, sad_r2, sad_r3;
6138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0 sad calculation
6158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
6168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
6178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
6188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
6198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
6208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_sad_epu8(src, ref_left);
6228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_sad_epu8(src, ref_right);
6238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_sad_epu8(src, ref_top);
6248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_sad_epu8(src, ref_bot);
6258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
6278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
6288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
6298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
6308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
6318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 1 sad calculation
6338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
6348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
6358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
6368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
6378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
6388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
6408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
6418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
6428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
6438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
6458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
6468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
6478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
6488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
6508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
6518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
6528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
6538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
6548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 2 sad calculation
6568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
6578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
6588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
6598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
6608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
6618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
6638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
6648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
6658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
6668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
6688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
6698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
6708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
6718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
6738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
6748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
6758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
6768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
6778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 3 sad calculation
6798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
6808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
6818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
6828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
6838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
6848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
6868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
6878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
6888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
6898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
6918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
6928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
6938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
6948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
6958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
6968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
6978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
6988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
6998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
7008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 4 sad calculation
7028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
7038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
7048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
7058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
7068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
7078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
7098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
7108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
7118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
7128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
7148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
7158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
7168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
7178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
7198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
7208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
7218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
7228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
7238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 5 sad calculation
7258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
7268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
7278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
7288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
7298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
7308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
7328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
7338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
7348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
7358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
7378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
7388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
7398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
7408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
7428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
7438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
7448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
7458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
7468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 6 sad calculation
7488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
7498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
7508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
7518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
7528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
7538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
7558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
7568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
7578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
7588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
7608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
7618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
7628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
7638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
7658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
7668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
7678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
7688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
7698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 7 sad calculation
7718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
7728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
7738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
7748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
7758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
7768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
7788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
7798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
7808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
7818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
7838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
7848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
7858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
7868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
7888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
7898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
7908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
7918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
7928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
7938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 8 sad calculation
7948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
7958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
7968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
7978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
7988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
7998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
8018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
8028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
8038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
8048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
8068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
8078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
8088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
8098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
8118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
8128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
8138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
8148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
8158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 9 sad calculation
8178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
8188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
8198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
8208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
8218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
8228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
8248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
8258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
8268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
8278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
8298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
8308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
8318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
8328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
8348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
8358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
8368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
8378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
8388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 10 sad calculation
8408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
8418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
8428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
8438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
8448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
8458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
8478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
8488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
8498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
8508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
8528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
8538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
8548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
8558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
8578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
8588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
8598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
8608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
8618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 11 sad calculation
8638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
8648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
8658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
8668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
8678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
8688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
8708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
8718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
8728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
8738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
8758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
8768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
8778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
8788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
8808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
8818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
8828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
8838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
8848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 12 sad calculation
8868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
8878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
8888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
8898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
8908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
8918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
8938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
8948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
8958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
8968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
8978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
8988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
8998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
9008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
9018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
9038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
9048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
9058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
9068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
9078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 13 sad calculation
9098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
9108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
9118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
9128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
9138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
9148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
9168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
9178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
9188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
9198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
9218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
9228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
9238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
9248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
9268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
9278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
9288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
9298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
9308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 14 sad calculation
9328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
9338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
9348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
9358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
9368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
9378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
9398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
9408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
9418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
9428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
9448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
9458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
9468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
9478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
9498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    left_ptr += ref_strd;
9508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    right_ptr += ref_strd;
9518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    top_ptr += ref_strd;
9528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    bot_ptr += ref_strd;
9538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 15 sad calculation
9558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
9568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
9578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
9588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
9598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
9608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_left);
9628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_right);
9638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_top);
9648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_bot);
9658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
9678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
9688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
9698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
9708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r0, 0);
9728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r0, 2);
9738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[0] = (val1 + val2);
9748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r1, 0);
9768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r1, 2);
9778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[1] = (val1 + val2);
9788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r2, 0);
9808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r2, 2);
9818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[2] = (val1 + val2);
9828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r3, 0);
9848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r3, 2);
9858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[3] = (val1 + val2);
9868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
9878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
9888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/**
9898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
9908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
9918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief computes distortion (SAD) at all subpel points about the src location
9928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
9938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @par Description
9948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   This function computes SAD at all points at a subpel distance from the
9958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*   current source location.
9968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
9978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src
9988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the source
9998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_x
10018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to half pel buffer
10028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_y
10048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to half pel buffer
10058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_ref_half_xy
10078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to half pel buffer
10088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd
10108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer source stride
10118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] ref_strd
10138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer ref stride
10148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_sad
10168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer evaluated sad
10178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[0] - half x
10188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[1] - half x - 1
10198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[2] - half y
10208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[3] - half y - 1
10218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[4] - half xy
10228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[5] - half xy - 1
10238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[6] - half xy - strd
10248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  pi4_sad[7] - half xy - 1 - strd
10258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
10278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
10288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
10298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
10308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
10318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   UWORD8 *pu1_ref_half_x,
10328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   UWORD8 *pu1_ref_half_y,
10338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   UWORD8 *pu1_ref_half_xy,
10348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   WORD32 src_strd,
10358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   WORD32 ref_strd,
10368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                   WORD32 *pi4_sad)
10378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
10388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
10398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
10408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
10418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
10428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
10438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 val1, val2;
10448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src, ref_half_x, ref_half_y, ref_half_xy;
10468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left;
10478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7;
10488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7;
10498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 0 sad calculation
10508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
10518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
10528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
10538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
10548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
10558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
10568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
10578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
10588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
10598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_sad_epu8(src, ref_half_x);
10618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_sad_epu8(src, ref_half_x_left);
10628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_sad_epu8(src, ref_half_y);
10638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_sad_epu8(src, ref_half_y_top);
10648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_sad_epu8(src, ref_half_xy);
10658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_sad_epu8(src, ref_half_xy_left);
10668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_sad_epu8(src, ref_half_xy_top);
10678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
10688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
10708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
10718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
10728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
10738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
10748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
10758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
10768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
10778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
10788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 1 sad calculation
10808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
10818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
10828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
10838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
10848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
10858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
10868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
10878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
10888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
10898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
10918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
10928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
10938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
10948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
10958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
10968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
10978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
10988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
10998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
11008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
11018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
11028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
11038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
11048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
11058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
11068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
11078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
11098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
11108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
11118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
11128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
11138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
11148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
11158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
11168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
11178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 2 sad calculation
11198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
11208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
11218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
11228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
11238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
11248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
11258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
11268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
11278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
11288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
11308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
11318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
11328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
11338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
11348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
11358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
11368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
11378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
11398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
11408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
11418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
11428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
11438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
11448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
11458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
11468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
11488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
11498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
11508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
11518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
11528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
11538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
11548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
11558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
11568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 3 sad calculation
11588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
11598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
11608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
11618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
11628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
11638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
11648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
11658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
11668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
11678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
11698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
11708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
11718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
11728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
11738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
11748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
11758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
11768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
11788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
11798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
11808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
11818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
11828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
11838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
11848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
11858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
11878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
11888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
11898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
11908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
11918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
11928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
11938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
11948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
11958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
11968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 4 sad calculation
11978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
11988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
11998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
12008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
12018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
12028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
12038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
12048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
12058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
12068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
12088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
12098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
12108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
12118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
12128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
12138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
12148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
12158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
12178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
12188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
12198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
12208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
12218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
12228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
12238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
12248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
12268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
12278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
12288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
12298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
12308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
12318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
12328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
12338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
12348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 5 sad calculation
12378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
12388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
12398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
12408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
12418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
12428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
12438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
12448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
12458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
12468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
12488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
12498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
12508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
12518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
12528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
12538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
12548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
12558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
12578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
12588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
12598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
12608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
12618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
12628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
12638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
12648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
12668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
12678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
12688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
12698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
12708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
12718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
12728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
12738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
12748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 6 sad calculation
12768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
12778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
12788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
12798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
12808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
12818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
12828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
12838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
12848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
12858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
12878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
12888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
12898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
12908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
12918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
12928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
12938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
12948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
12958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
12968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
12978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
12988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
12998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
13008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
13018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
13028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
13038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
13058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
13068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
13078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
13088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
13098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
13108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
13118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
13128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
13138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 7 sad calculation
13158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
13168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
13178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
13188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
13198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
13208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
13218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
13228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
13238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
13248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
13268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
13278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
13288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
13298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
13308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
13318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
13328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
13338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
13358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
13368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
13378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
13388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
13398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
13408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
13418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
13428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
13448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
13458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
13468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
13478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
13488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
13498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
13508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
13518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
13528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 8 sad calculation
13548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
13558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
13568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
13578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
13588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
13598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
13608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
13618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
13628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
13638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
13658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
13668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
13678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
13688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
13698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
13708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
13718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
13728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
13748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
13758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
13768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
13778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
13788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
13798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
13808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
13818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
13838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
13848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
13858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
13868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
13878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
13888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
13898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
13908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
13918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
13928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 9 sad calculation
13938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
13948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
13958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
13968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
13978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
13988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
13998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
14008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
14018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
14028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
14048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
14058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
14068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
14078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
14088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
14098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
14108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
14118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
14138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
14148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
14158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
14168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
14178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
14188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
14198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
14208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
14228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
14238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
14248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
14258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
14268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
14278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
14288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
14298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
14308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 10 sad calculation
14328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
14338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
14348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
14358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
14368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
14378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
14388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
14398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
14408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
14418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
14438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
14448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
14458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
14468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
14478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
14488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
14498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
14508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
14528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
14538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
14548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
14558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
14568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
14578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
14588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
14598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
14618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
14628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
14638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
14648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
14658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
14668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
14678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
14688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
14698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 11 sad calculation
14718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
14728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
14738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
14748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
14758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
14768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
14778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
14788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
14798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
14808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
14828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
14838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
14848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
14858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
14868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
14878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
14888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
14898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
14918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
14928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
14938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
14948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
14958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
14968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
14978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
14988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
14998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
15008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
15018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
15028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
15038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
15048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
15058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
15068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
15078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
15088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 12 sad calculation
15108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
15118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
15128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
15138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
15148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
15158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
15168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
15178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
15188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
15198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
15218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
15228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
15238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
15248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
15258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
15268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
15278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
15288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
15308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
15318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
15328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
15338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
15348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
15358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
15368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
15378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
15398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
15408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
15418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
15428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
15438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
15448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
15458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
15468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
15478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 13 sad calculation
15498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
15508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
15518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
15528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
15538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
15548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
15558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
15568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
15578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
15588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
15608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
15618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
15628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
15638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
15648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
15658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
15668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
15678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
15698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
15708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
15718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
15728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
15738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
15748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
15758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
15768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
15788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
15798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
15808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
15818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
15828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
15838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
15848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
15858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
15868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 14 sad calculation
15888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
15898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
15908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
15918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
15928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
15938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
15948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
15958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
15968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
15978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
15988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
15998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
16008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
16018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
16028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
16038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
16048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
16058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
16068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
16088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
16098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
16108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
16118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
16128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
16138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
16148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
16158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_src += src_strd;
16178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x += ref_strd;
16188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_x_left += ref_strd;
16198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y += ref_strd;
16208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_y_top += ref_strd;
16218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy += ref_strd;
16228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_left += ref_strd;
16238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top += ref_strd;
16248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pu1_ref_half_xy_top_left += ref_strd;
16258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    // Row 15 sad calculation
16278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    src = _mm_loadu_si128((__m128i *) (pu1_src));
16288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
16298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
16308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
16318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
16328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
16338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
16348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
16358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
16368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r0 = _mm_sad_epu8(src, ref_half_x);
16388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
16398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r2 = _mm_sad_epu8(src, ref_half_y);
16408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
16418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r4 = _mm_sad_epu8(src, ref_half_xy);
16428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
16438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
16448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
16458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
16478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
16488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
16498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
16508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
16518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
16528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
16538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
16548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r0, 0);
16568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r0, 2);
16578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[0] = (val1 + val2);
16588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r1, 0);
16608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r1, 2);
16618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[1] = (val1 + val2);
16628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r2, 0);
16648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r2, 2);
16658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[2] = (val1 + val2);
16668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r3, 0);
16688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r3, 2);
16698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[3] = (val1 + val2);
16708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r4, 0);
16728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r4, 2);
16738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[4] = (val1 + val2);
16748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r5, 0);
16768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r5, 2);
16778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[5] = (val1 + val2);
16788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r6, 0);
16808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r6, 2);
16818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[6] = (val1 + val2);
16828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val1 = _mm_extract_epi32(sad_r7, 0);
16848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    val2 = _mm_extract_epi32(sad_r7, 2);
16858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    pi4_sad[7] = (val1 + val2);
16868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
16878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    return;
16888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
16898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S/*
16908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
16918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @brief This function computes SAD between two 16x16 blocks
16928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*        It also computes if the block will be zero after H264 transform and quant for
16938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*        Intra 16x16 blocks
16948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
16958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_src
16968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the source
16978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
16988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu1_est
16998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  UWORD8 pointer to the estimated block
17008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] src_strd
17028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer source stride
17038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] est_strd
17058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer stride of the estimated block
17068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[in] pu2_thrsh
17088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  Threshold for each element of the transformed, quantized block
17098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pi4_mb_distortion
17118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  integer evaluated sad
17128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @param[out] pu4_is_zero
17148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*  Pointer to store whether the block is zero after transform and quantization
17158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S* @remarks
17178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*
17188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S******************************************************************************
17198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S*/
17208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha Svoid ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
17218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         UWORD8 *pu1_est,
17228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         WORD32 src_strd,
17238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         WORD32 est_strd,
17248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         UWORD16 *pu2_thrsh,
17258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         WORD32 *pi4_mb_distortion,
17268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                                         UWORD32 *pu4_is_zero)
17278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S{
17288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i src_r0, src_r1, src_r2, src_r3;
17298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i est_r0, est_r1, est_r2, est_r3;
17308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i temp0, temp1, temp2, temp3, temp4;
17318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i zero = _mm_setzero_si128();          // all bits reset to zero
17328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i all_one = _mm_set1_epi8(0xFF);
17338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    __m128i sad_b1, sad_b2, threshold;
17348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD16 sad_1, sad_2;
17358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 i;
17368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    UWORD32 flag = 0;
17378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    WORD32 test1, test2;
17388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
17398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    (*pi4_mb_distortion) = 0;
17408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    for (i=0; i<4; i++)
17428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    {
17438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
17448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
17458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
17468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
17478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_cvtepu8_epi16(src_r0);
17498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_cvtepu8_epi16(src_r1);
17508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_cvtepu8_epi16(src_r2);
17518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_cvtepu8_epi16(src_r3);
17528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
17548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
17558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
17568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
17578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r0 = _mm_cvtepu8_epi16(est_r0);
17598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r1 = _mm_cvtepu8_epi16(est_r1);
17608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r2 = _mm_cvtepu8_epi16(est_r2);
17618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r3 = _mm_cvtepu8_epi16(est_r3);
17628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_sub_epi16(src_r0, est_r0);
17648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_sub_epi16(src_r1, est_r1);
17658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_sub_epi16(src_r2, est_r2);
17668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_sub_epi16(src_r3, est_r3);
17678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_abs_epi16(src_r0);
17698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_abs_epi16(src_r1);
17708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_abs_epi16(src_r2);
17718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_abs_epi16(src_r3);
17728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
17748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
17758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //SAD calculation
17778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_add_epi16(src_r0, src_r1);      //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
17788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_hadd_epi16(temp0, zero);
17798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
17808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        sad_1 = _mm_extract_epi16(temp0, 0);
17828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        sad_2 = _mm_extract_epi16(temp0, 1);
17838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        (*pi4_mb_distortion) += sad_1 + sad_2;
17858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if (flag == 0) {
17878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b1 = _mm_set1_epi16((sad_1 << 1));
17888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b2 = _mm_set1_epi16((sad_2 << 1));
17898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
17918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
17928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
17948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
17958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
17978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
17988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
17998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
18008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
18018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
18038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
18048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
18068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
18078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
18098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
18118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
18128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
18148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
18168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
18178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
18198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
18208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
18228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
18238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
18258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
18278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
18298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_xor_si128(temp1, all_one);
18308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            test1 = _mm_test_all_zeros(temp0, all_one);
18328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            test2 = _mm_test_all_zeros(temp1, all_one);
18338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
18358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    || pu2_thrsh[8] <= sad_2)
18368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                flag = 1;
18378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
18388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        pu1_src += 8;
18408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        pu1_est += 8;
18418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
18438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
18448d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
18458d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
18468d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18478d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_cvtepu8_epi16(src_r0);
18488d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_cvtepu8_epi16(src_r1);
18498d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_cvtepu8_epi16(src_r2);
18508d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_cvtepu8_epi16(src_r3);
18518d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18528d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
18538d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
18548d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
18558d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
18568d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18578d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r0 = _mm_cvtepu8_epi16(est_r0);
18588d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r1 = _mm_cvtepu8_epi16(est_r1);
18598d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r2 = _mm_cvtepu8_epi16(est_r2);
18608d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        est_r3 = _mm_cvtepu8_epi16(est_r3);
18618d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18628d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_sub_epi16(src_r0, est_r0);
18638d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_sub_epi16(src_r1, est_r1);
18648d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_sub_epi16(src_r2, est_r2);
18658d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_sub_epi16(src_r3, est_r3);
18668d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18678d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_abs_epi16(src_r0);
18688d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_abs_epi16(src_r1);
18698d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r2 = _mm_abs_epi16(src_r2);
18708d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r3 = _mm_abs_epi16(src_r3);
18718d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18728d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
18738d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
18748d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18758d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        //SAD calculation
18768d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_add_epi16(src_r0, src_r1);
18778d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_hadd_epi16(temp0, zero);
18788d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
18798d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18808d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        sad_1 = _mm_extract_epi16(temp0, 0);
18818d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        sad_2 = _mm_extract_epi16(temp0, 1);
18828d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18838d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        (*pi4_mb_distortion) += sad_1 + sad_2;
18848d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18858d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        if (flag == 0) {
18868d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b1 = _mm_set1_epi16((sad_1 << 1));
18878d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b2 = _mm_set1_epi16((sad_2 << 1));
18888d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18898d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
18908d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
18918d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18928d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
18938d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
18948d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18958d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
18968d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
18978d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
18988d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
18998d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
19008d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19018d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
19028d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
19038d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19048d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
19058d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
19068d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19078d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
19088d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19098d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
19108d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
19118d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19128d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
19138d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19148d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
19158d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
19168d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19178d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
19188d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
19198d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19208d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
19218d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
19228d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19238d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
19248d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19258d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
19268d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19278d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
19288d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            temp1 = _mm_xor_si128(temp1, all_one);
19298d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19308d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            test1 = _mm_test_all_zeros(temp0, all_one);
19318d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            test2 = _mm_test_all_zeros(temp1, all_one);
19328d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19338d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
19348d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                    || pu2_thrsh[8] <= sad_2)
19358d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S                flag = 1;
19368d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        }
19378d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19388d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        pu1_src += 4*src_strd - 8;
19398d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        pu1_est += 4*est_strd - 8;
19408d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S    }
19418d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S
19428d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S        *pu4_is_zero = flag;
19438d3d303c7942ced6a987a52db8977d768dc3605fHamsalekha S}
1944