10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_weighted_pred_atom_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function definitions for weighted prediction used in inter
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* prediction
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_uni_ssse3()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_bi_ssse3()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_bi_default_ssse3()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_chroma_uni_ssse3()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_chroma_bi_ssse3()
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   - ihevc_weighted_pred_chroma_bi_default_ssse3()
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes                                                             */
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdio.h>
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h>
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_debug.h"
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_weighted_pred.h"
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_inter_pred.h"
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
*  Does uni-weighted prediction on the array pointed by pi2_src and stores
* it at the location pointed by pu1_dst
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
*  dst = CLIP_U8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
*                 + off0 )
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to the source
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to the destination
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to the source
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
* @param[in] off0
*  offset to be added after rounding and shifting
*
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   UWORD8 *pu1_dst,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 src_strd,
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 dst_strd,
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 wgt0,
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 off0,
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 shift,
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 lvl_shift,
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 ht,
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                   WORD32 wd)
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, temp;
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* all 128 bit registers are named with a suffix mxnb, where m is the */
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* number of n bits packed in the register                            */
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 4 == 0); /* checking assumption*/
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 4 == 0); /* checking assumption*/
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp = 1 << (shift - 1);
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // seting values in register
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt0_8x16b = _mm_set1_epi16(wgt0);
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 */
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b = _mm_set1_epi32(temp);
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    off0_4x32b = _mm_set1_epi32(off0);
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 */
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 + 1 << (shift - 1) */
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (wd & 7)) /* wd multiple of 8 case */
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 4)
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 8)
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {   /* for row =0 ,1,2,3*/
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 1 */
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 2 */
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 3 */
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */ /* First 4 pixels */
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */ /* Last 4 pixels */
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 2*/
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 1*/
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* To update pointer */
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src += 8;
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 8;
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else  /* wd multiple of 4 case */
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 dst0, dst1, dst2, dst3;
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 4)
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 4)
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {   /* for row =0 ,1,2,3*/
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 1 */
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 2 */
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 3 */
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* 2 rows together */
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (i4_tmp >> shift) + off0; */
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* dst row = 1 to 3 */
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 1 to row = 3 */
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* To update pointer */
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src += 4;
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 4;
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does chroma uni-weighted prediction on array pointed by pi2_src and stores
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* it at the location pointed by pu1_dst
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  + offset
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to the source
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to the destination
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0_cb,wgt0_cr
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weights to be multiplied to the Cb and Cr samples of the source
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0_cb,off0_cr
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offsets to be added to the Cb and Cr samples after rounding and
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  shifting
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  (14 Bit depth) + log2_weight_denominator
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source (each colour component)
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          UWORD8 *pu1_dst,
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 src_strd,
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 dst_strd,
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 wgt0_cb,
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 wgt0_cr,
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 off0_cb,
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 off0_cr,
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 shift,
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 lvl_shift,
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 ht,
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 wd)
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, temp, wdx2;
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* all 128 bit registers are named with a suffix mxnb, where m is the */
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* number of n bits packed in the register                            */
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp0_8x16b, src_temp1_8x16b;
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i res_temp0_4x32b, res_temp1_4x32b;
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 2 == 0); /* checking assumption*/
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 2 == 0); /* checking assumption*/
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp = 1 << (shift - 1);
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wdx2 = 2 * wd;
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // seting values in register
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 */
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b = _mm_set1_epi32(temp);
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 */
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift * wgt0 + 1 << (shift - 1) */
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_temp2_8x16b, src_temp3_8x16b;
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp2_4x32b, res_temp3_4x32b;
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 16)
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 0 */ /* Next 8 pixels */
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Get 32 bit Result */
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b,  shift);
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b,  shift);
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store 16 8-bit output values  */
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src += 16;  /* Pointer update */
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 16; /* Pointer update */
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp2_4x32b, res_temp3_4x32b;
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Get 32 bit Result */
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store four 8-bit output values  */
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src += 8;   /* Pointer update */
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 8; /* Pointer update */
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* 2*wd multiple of 4 case */
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1;
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* 2 rows together */
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Get 32 bit Result */
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*i4_tmp = (i4_tmp >> shift) + off0; */
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* dst row = 1 to 3 */
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store four 8-bit output values  */
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src += 4;   /* Pointer update */
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 4; /* Pointer update */
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores the result at the location pointed to by pu1_dst
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = CLIP_U8( ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* ((off0 + off1 + 1) << (shift - 1)) ) >> shift )
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 1
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 0
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt1
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weight to be multiplied to source 2
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off1
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 1
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  (14 Bit depth) + log2_weight_denominator
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  level shift added to source 1 before it is weighted by wgt0
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  level shift added to source 2 before it is weighted by wgt1
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD16 *pi2_src2,
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  UWORD8 *pu1_dst,
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 src_strd1,
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 src_strd2,
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 dst_strd,
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 wgt0,
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 off0,
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 wgt1,
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 off1,
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 shift,
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 lvl_shift1,
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 lvl_shift2,
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 ht,
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                  WORD32 wd)
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, temp;
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h>
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 4 == 0); /* checking assumption*/
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 4 == 0); /* checking assumption*/
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp = (off0 + off1 + 1) << (shift - 1);
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // seting values in register
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt0_8x16b = _mm_set1_epi16(wgt0);
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt1_8x16b = _mm_set1_epi16(wgt1);
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift1 * wgt0 */
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift2 * wgt1 */
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b = _mm_set1_epi32(temp);
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift1 * wgt0 */
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift2 * wgt1 */
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (wd & 7)) /* wd multiple of 8 case */
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 2)
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 8)
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src[col] + lvl_shift) * wgt */
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Next 4 Pixels */
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 += 8;  /* Pointer update */
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 += 8;  /* Pointer update */
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  += 8;  /* Pointer update */
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        } /* outer loop ends */
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* wd multiple of 4 case */
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 dst0, dst1;
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 2)
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 4)
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* 2 rows together */
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src[col] + lvl_shift) * wgt */
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* dst row = 1 to 3 */
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 1 */
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 += 4;  /* Pointer update */
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 += 4;  /* Pointer update */
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  += 4;  /* Pointer update */
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        } /* outer loop ends */
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores the result at the location pointed to by pu1_dst
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = CLIP_U8( ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ((off0 + off1 + 1) << (shift - 1)) ) >> shift )
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0_cb,wgt0_cr
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weights (cb and cr) to be multiplied to source 1
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0_cb,off0_cr
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 0 (cb and cr)
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt1_cb,wgt1_cr
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  weights (cb and cr) to be multiplied to source 2
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off1_cb,off1_cr
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  offset 1 (cb and cr)
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  (14 Bit depth) + log2_weight_denominator
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source (each colour component)
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD16 *pi2_src2,
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         UWORD8 *pu1_dst,
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 src_strd1,
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 src_strd2,
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 dst_strd,
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 wgt0_cb,
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 wgt0_cr,
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 off0_cb,
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 off0_cr,
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 wgt1_cb,
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 wgt1_cr,
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 off1_cb,
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 off1_cr,
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 shift,
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 lvl_shift1,
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 lvl_shift2,
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 ht,
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 wd)
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, temp1, temp2;
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 wdx2;
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 2 == 0); /* checking assumption*/
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 2 == 0); /* checking assumption*/
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // seting values in register
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift1 * wgt0 */
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift2 * wgt1 */
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wdx2 = wd * 2;
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift1 * wgt0 */
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* lvl_shift2 * wgt1 */
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 2)
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wdx2; col += 8)
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src[col] + lvl_shift) * wgt */
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Next 4 Pixels */
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 += 8;  /* Pointer update */
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 += 8;  /* Pointer update */
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  += 8;  /* Pointer update */
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        } /* outer loop ends */
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* wdx2 multiple of 4 case */
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 dst0, dst1;
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  outer for loop starts from here */
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 2)
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wdx2; col += 4)
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* 2 rows together */
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* Get 32 bit Result */
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src[col] + lvl_shift) * wgt */
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (i4_tmp >> shift) */
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* dst row = 1 to 3 */
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* store four 8-bit output values  */
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* row = 1 */
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 += 4;  /* Pointer update */
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 += 4;  /* Pointer update */
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  += 4;  /* Pointer update */
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here(4-output values in single iteration) */
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores the result at the location pointed to by pu1_dst
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* >> shift,  where shift = 15 - BitDepth
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source
12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Assumption : ht%2 == 0, wd%4 == 0
12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* final result will match even if intermediate precision is in 16 bit.
12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD16 *pi2_src2,
12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          UWORD8 *pu1_dst,
12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 src_strd1,
12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 src_strd2,
12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 dst_strd,
13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 lvl_shift1,
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 lvl_shift2,
13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 ht,
13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 wd)
13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 row, col, temp;
13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 shift;
13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT(wd % 4 == 0); /* checking assumption*/
13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ASSERT(ht % 2 == 0); /* checking assumption*/
13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp = 1 << (shift - 1);
13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // seting values in register
13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp_8x16b = _mm_set1_epi16(temp);
13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (ht & 3)) /* ht multiple of 4*/
13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(0 == (wd & 15)) /* wd multiple of 16 case */
13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*  outer for loop starts from here */
13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(row = 0; row < ht; row += 4)
13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    for(col = 0; col < wd; col += 16)
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 8 pixel values */ /* First 8 Values */
13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 1 */
13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 2 */
13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 3 */
13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 8 pixel values */ /* Second 8 Values */
13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 1 */
13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 2 */
13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 8 pixel values */ /* Second 8 Values */
13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 3 */
13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (i4_tmp >> shift) */ /* First 8 Values */
13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (i4_tmp >> shift) */ /* Second 8 Values */
13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */
14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* store four 8-bit output values  */ /* 16 8 Values */
14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* To update pointer */
14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src1 += 16;
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src2 += 16;
14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pu1_dst  += 16;
14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    } /* inner loop ends here(8-output values in single iteration) */
14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(0 == (wd & 7)) /* multiple of 8 case */
14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*  outer for loop starts from here */
14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(row = 0; row < ht; row += 4)
14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    for(col = 0; col < wd; col += 8)
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 8 pixel values */
14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 1 */
14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 2 */
14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 3 */
14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (pi2_src1[col] + pi2_src2[col]) */
14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (i4_tmp >> shift) */
14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* store four 8-bit output values  */
14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* To update pointer */
14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src1 += 8;
14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src2 += 8;
14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pu1_dst  += 8;
14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    } /* inner loop ends here(8-output values in single iteration) */
14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else /* wd multiple of 4 case*/
14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                WORD32 dst0, dst1, dst2, dst3;
14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*  outer for loop starts from here */
14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(row = 0; row < ht; row += 4)
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    for(col = 0; col < wd; col += 4)
14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    {
14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 1 */
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 2 */
15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 3 */
15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* Pack two rows together */
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (pi2_src1[col] + pi2_src2[col]) */
15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* (i4_tmp >> shift) */
15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* dst row = 1 to 3 */
15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* store four 8-bit output values  */
15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* row = 1 to row = 3 */
15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        /* To update pointer */
15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src1 += 4;
15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pi2_src2 += 4;
15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        pu1_dst  += 4;
15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    } /* inner loop ends here(4-output values in single iteration) */
15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */
15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                }
15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* ht multiple of 2 case and wd multiple of 4 case*/
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1;
15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wd; col += 4)
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Pack two rows together */
15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */
15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* dst row = 1 to 3 */
16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store four 8-bit output values  */
16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 to row = 3 */
16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 4;
16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 4;
16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 4;
16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */
16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location  pointed by pu1_dst
16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  (1 << (shift - 1)) )
16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* >> shift  where shift = 15 - BitDepth
16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1
16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 1
16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2
16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to source 2
16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Pointer to destination
16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1
16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 1
16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2
16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Source stride 2
16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Destination stride
16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1
16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2
16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  added before shift and offset
16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  height of the source
16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  width of the source (each colour component)
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1==0, lvl_shift2==0.
16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* final result will match even if intermediate precision is in 16 bit.
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD16 *pi2_src2,
16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 UWORD8 *pu1_dst,
16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 src_strd1,
16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 src_strd2,
16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 dst_strd,
16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 lvl_shift1,
16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 lvl_shift2,
16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 ht,
16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                                 WORD32 wd)
16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, temp;
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 shift, wdx2;
16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i lvl_shift1_8x16b;
17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 2 == 0); /* checking assumption*/
17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 2 == 0); /* checking assumption*/
17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(lvl_shift1);
17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(lvl_shift2);
17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    temp = 1 << (shift - 1);
17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wdx2 = wd * 2;
17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    // seting values in register
17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    lvl_shift1_8x16b = _mm_set1_epi16(temp);
17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (ht & 3)) /* ht multiple of 4 case */
17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 16)
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */ /* First 8 Values */
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 2 */
17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 3 */
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */ /* Second 8 Values */
17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 2 */
17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */ /* Second 8 Values */
17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 3 */
17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */ /* First 8 Values */
17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */ /* Second 8 Values */
17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values per row  */ /* First 8 Values */
17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values per row  */ /* Second 8 Values */
18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/
18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 1*/
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 16;
18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 16;
18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 16;
18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(8-output values in single iteration) */
18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */
18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 2 */
18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 3 */
18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */
18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values per row  */
18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 8;
18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 8;
18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 8;
18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(8-output values in single iteration) */
18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* 2*wd multiple of 4 case */
18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1, dst2, dst3;
18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 2 */
19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 3 */
19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Pack two rows together */
19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */
19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* dst row = 1 to 3 */
19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store four 8-bit output values  */
19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 to row = 3 */
19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 4;
19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 4;
19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 4;
19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;   /* Pointer update */
19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;   /* Pointer update */
19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst  - wdx2 + 4 * dst_strd;    /* Pointer update */
19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* ht multiple of 2 case */
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 16)
19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */ /* First 8 Values */
19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */ /* Second 8 Values */
19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b  = _mm_adds_epi16(src_temp1_8x16b,  src_temp2_8x16b);
19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b  = _mm_adds_epi16(src_temp3_8x16b,  src_temp4_8x16b);
19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */ /* First 8 Values */
20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */ /* Second 8 Values */
20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values per row  */ /* First 8 Values */
20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values per row  */ /* Second 8 Values */
20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/
20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 16;
20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 16;
20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 16;
20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(8-output values in single iteration) */
20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values */
20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */
20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store eight 8-bit output values  */
20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 8;
20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 8;
20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 8;
20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(8-output values in single iteration) */
20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* 2*wd multiple of 4 case */
20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1;
20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  outer for loop starts from here */
20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* Pack two rows together */
21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (pi2_src1[col] + pi2_src2[col]) */
21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* (i4_tmp >> shift) */
21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* dst row = 1 */
21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* store four 8-bit output values  */
21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* row = 1 */
21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* To update pointer */
21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src1 += 4;
21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_src2 += 4;
21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst  += 4;
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner loop ends here(4-output values in single iteration) */
21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;   /* Pointer update */
21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;   /* Pointer update */
21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;    /* Pointer update */
21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
2144