/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_atom_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_ssse3()
*   - ihevc_weighted_pred_bi_ssse3()
*   - ihevc_weighted_pred_bi_default_ssse3()
*   - ihevc_weighted_pred_chroma_uni_ssse3()
*   - ihevc_weighted_pred_chroma_bi_ssse3()
*   - ihevc_weighted_pred_chroma_bi_default_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>
#include <string.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"


#include <immintrin.h>

/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed by pi2_src and stores
* it at the location pointed by pu1_dst
*
* @par Description:
*  dst = CLIP_U8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
* + off0 )
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src, 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0, 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0, 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift, 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift, 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp; 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* all 128 bit registers are named with a suffix mxnb, where m is the */ 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* number of n bits packed in the register */ 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b; 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b; 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 4 == 0); /* checking assumption*/ 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 4 == 0); /* checking assumption*/ 
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp = 1 << (shift - 1); 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_set1_epi16(lvl_shift); 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt0_8x16b = _mm_set1_epi16(wgt0); 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 */ 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_4x32b = _mm_set1_epi32(temp); 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar off0_4x32b = _mm_set1_epi32(off0); 1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 */ 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 + 1 << (shift - 1) */ 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wd & 7)) /* wd multiple of 8 case */ 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 8) 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { /* for row =0 ,1,2,3*/ 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = 
_mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b); 1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* First 4 pixels */ 2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = 
_mm_srai_epi32(res_temp2_4x32b, shift); 2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* Last 4 pixels */ 2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); 2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); 2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */ 2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); 2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); 2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b 
= _mm_add_epi32(res_temp6_4x32b, off0_4x32b); 2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); 2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b); 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b); 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b); 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 2*/ 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 1*/ 
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/ 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 8; 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ 2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wd multiple of 4 case */ 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1, dst2, dst3; 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 4) 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { /* for row =0 ,1,2,3*/ 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /*load 8 pixel values from 7:0 pos. 
relative to cur. pos.*/ 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd)); 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd)); 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 2 rows together */ 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b); 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */ 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b); 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b); 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2); 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = 
_mm_shuffle_epi32(res_temp0_4x32b, 3); 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst2 = _mm_cvtsi128_si32(res_temp2_4x32b); 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst3 = _mm_cvtsi128_si32(res_temp3_4x32b); 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 to row = 3 */ 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 4; 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does chroma uni-weighted prediction on array pointed by pi2_src and stores 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* it at the location pointed by pi2_dst 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift + 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* offset 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to the source 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to the destination 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied to the Cb samples of the source
*
* @param[in] wgt0_cr
*  weight to be multiplied to the Cr samples of the source
*
* @param[in] off0_cb
*  offset to be added to the Cb samples after rounding and shifting
*
* @param[in] off0_cr
*  offset to be added to the Cr samples after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src, 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0_cb, 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0_cr, 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0_cb, 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0_cr, 4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift, 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift, 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp, wdx2; 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* all 128 bit registers are named with a suffix mxnb, where m is the */ 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* number of n bits packed in the register */ 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp0_8x16b, src_temp1_8x16b; 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp0_4x32b, res_temp1_4x32b; 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 2 == 0); /* checking assumption*/ 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp = 1 << (shift - 1); 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wdx2 = 2 * wd; 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_set1_epi16(lvl_shift); 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 */ 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_4x32b = _mm_set1_epi32(temp); 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb); 
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 */ 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift * wgt0 + 1 << (shift - 1) */ 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp2_8x16b, src_temp3_8x16b; 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp2_4x32b, res_temp3_4x32b; 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 16) 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /* Next 8 pixels */ 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8)); 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8)); 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, 
wgt0_8x16b); 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b); 4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b); 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b); 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b); 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b); 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b); 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b); 
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b); 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b); 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store 16 8-bit output values */ 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 16; /* Pointer update */ 5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; /* Pointer update */ 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp2_4x32b, res_temp3_4x32b; 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer 
for loop starts from here */ 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = 
_mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, 
off0_4x32b); 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 8; /* Pointer update */ 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; /* Pointer update */ 
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* 2*wd multiple of 4 case */ 6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); 6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 2 rows together */ 6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b); 6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = 
_mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (i4_tmp >> shift) + off0; */ 6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); 6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); 6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); 6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); 6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src += 4; /* Pointer update */ 6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; /* Pointer update */ 6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does bi-weighted prediction on the arrays pointed by pi2_src1 and 6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location pointed by pi2_dst 6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar* 6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + 6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* off1 + 1) << (shift - 1) ) >> shift 6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1 6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to source 1 6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2 6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to source 2 6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst 6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to destination 6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1 6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 1 6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2 6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 2 6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd 6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Destination stride 6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt0 7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* weight to be multiplied to source 1 7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off0 7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* offset 0 7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wgt1 7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* weight to be multiplied to source 2 7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] off1 7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* offset 1 7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] shift 7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* (14 Bit depth) + log2_weight_denominator 7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1 7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* added before shift and offset 7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2 7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* added before shift and offset 7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht 7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* height of the source 7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd 7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* width of the source 7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1, 7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src2, 7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd1, 7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd2, 7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0, 7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0, 7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt1, 7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off1, 7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift, 7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift1, 7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift2, 7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp; 7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
__m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; 7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; 7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h> 7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 4 == 0); /* checking assumption*/ 7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 4 == 0); /* checking assumption*/ 7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp = (off0 + off1 + 1) << (shift - 1); 7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); 7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt0_8x16b = _mm_set1_epi16(wgt0); 7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); 7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt1_8x16b = _mm_set1_epi16(wgt1); 7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift1 * wgt0 */ 7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); 7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); 7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift2 * wgt1 */ 7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); 7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); 7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_4x32b = _mm_set1_epi32(temp); 7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift1 * wgt0 */ 7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); 7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift2 * wgt1 */ 7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); 7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wd & 7)) /* wd multiple of 8 case */ 7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; 7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 8) 7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ 7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ 7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); 8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); 8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* Get 32 bit Result */ 8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); 8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); 8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src[col] + lvl_shift) * wgt */ 8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); 8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); 8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); 8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); 8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Next 4 Pixels */ 8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); 8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); 
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); 8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); 8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ 8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ 8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 8; /* Pointer update */ 8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 8; /* Pointer update */ 
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; /* Pointer update */ 8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* outer loop ends */ 8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wd multiple of 4 case */ 8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 4) 8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ 8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ 8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 2 rows together */ 8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src[col] + lvl_shift) * wgt */ 9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); 9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); 9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); 9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /* row = 1 */ 9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; /* Pointer update */ 9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 4; /* Pointer update */ 9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; /* Pointer update */ 9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* outer loop ends */ 9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and 
*  pi2_src2 and stores it at location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
* off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied to source 1 (Cb)
*
* @param[in] wgt0_cr
*  weight to be multiplied to source 1 (Cr)
*
* @param[in] off0_cb
*  offset 0 (Cb)
*
* @param[in] off0_cr
*  offset 0 (Cr)
*
* @param[in] wgt1_cb
*  weight to be multiplied to source 2 (Cb)
*
* @param[in] wgt1_cr
*  weight to be multiplied to source 2 (Cr)
*
* @param[in] off1_cb
*  offset 1 (Cb)
*
* @param[in] off1_cr
*  offset 1 (Cr)
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1, 10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src2, 10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd1, 10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd2, 10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0_cb, 10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt0_cr, 10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0_cb, 10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off0_cr, 10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt1_cb, 10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wgt1_cr, 10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off1_cb, 10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 off1_cr, 10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift, 
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift1, 10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift2, 10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp1, temp2; 10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wdx2; 10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; 10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; 10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 2 == 0); /* checking assumption*/ 10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp1 = (off0_cb + off1_cb + 1) << (shift - 1); 10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp2 = (off0_cr + off1_cr + 1) << (shift - 1); 10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); 10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, 
wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); 10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); 10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb); 10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift1 * wgt0 */ 10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); 10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); 10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift2 * wgt1 */ 10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); 10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); 10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1); 10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wdx2 = wd * 2; 10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift1 * wgt0 */ 10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); 10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* lvl_shift2 * wgt1 */ 10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); 10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 7)) /* wdx2 multiple of 
8 case */ 10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; 10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ 10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ 10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = 
_mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); 10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); 10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); 11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); 
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src[col] + lvl_shift) * wgt */ 11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); 11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); 11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); 11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); 11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Next 4 Pixels */ 11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); 11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); 11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); 11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); 11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ 11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ 11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 8; /* Pointer update */ 11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 8; /* Pointer update */ 11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; /* Pointer update */ 11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* outer loop ends */ 11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wdx2 multiple of 4 case */ 11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ 11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ 11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 2 rows together */ 11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Get 32 bit Result */ 11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src[col] + lvl_shift) * wgt */ 11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + 
lvl_shift2) * wgt1 */ 11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); 12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); 12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); 12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; /* Pointer update */ 12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 4; /* Pointer update */ 12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; /* Pointer update */ 12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores it at location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
* >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%4 == 0, wd%4 == 0
* shift == 7,
(lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, 12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* final result will match even if intermediate precision is in 16 bit. 12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1, 12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src2, 12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd1, 12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd2, 12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift1, 13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift2, 13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp; 13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift; 13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b; 13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i 
src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 4 == 0); /* checking assumption*/ 13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = SHIFT_14_MINUS_BIT_DEPTH + 1; 13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp = 1 << (shift - 1); 13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1); 13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2); 13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar const_temp_8x16b = _mm_set1_epi16(temp); 13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b); 13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b); 13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (ht & 3)) /* ht multiple of 4*/ 13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wd & 15)) /* wd multiple of 16 case */ 13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; 13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 16) 13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* First 8 Values */ 13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* Second 8 Values */ 13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); 13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8)); 13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ 
/* Second 8 Values */ 13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); 13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); 13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); 13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); 13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
(i4_tmp >> shift) */ /* First 8 Values */ 13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); 13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); 13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* Second 8 Values */ 13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); 14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); 
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */ 14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b); 14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b); 14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b); 14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b); 14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ /* 16 8 Values */ 14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 16; 14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 16; 14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; 14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wd & 7)) /* multiple of 8 case */ 14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 8) 14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ 14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 8; 14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 8; 14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; 14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wd multiple of 4 case*/ 14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1, dst2, dst3; 14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 4) 14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); 15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); 15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); 15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); 15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Pack two rows together */ 
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); 15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); 15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> 
shift); */ 15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); 15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); 15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); 15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 to row = 3 */ 15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; 15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 4; 15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; 15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* ht multiple of 2 case and wd multiple of 4 case*/ 15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 4) 
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Pack two rows together */ 15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = 
_mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 to row = 3 */ 16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; 16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 
+= 4; 16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; 16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and 16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pi2_src2 and stores it at location pointed by pi2_dst 16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) ) 16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* >> shift where shift = 15 - BitDepth 16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src1 16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to source 1 16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi2_src2 16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to source 2 16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst 16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Pointer to destination 16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd1 16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 1 16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd2 16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Source stride 2 16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd 16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Destination stride 16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift1 16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* added before shift and offset 
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] lvl_shift2 16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* added before shift and offset 16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht 16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* height of the source 16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd 16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* width of the source (each colour component) 16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1==0, lvl_shift2==0. 16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, 16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* final result will match even if intermediate precision is in 16 bit. 
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1, 16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_src2, 16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd1, 16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd2, 16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift1, 16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 lvl_shift2, 16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, temp; 16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 shift, wdx2; 16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i lvl_shift1_8x16b; 17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 2 == 0); /* checking assumption*/ 17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(lvl_shift1); 17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(lvl_shift2); 17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar shift = SHIFT_14_MINUS_BIT_DEPTH + 1; 17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar temp = 1 << (shift - 1); 17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wdx2 = wd * 2; 17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar // seting values in register 17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar lvl_shift1_8x16b = _mm_set1_epi16(temp); 17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (ht & 3)) /* ht multiple of 4 case */ 17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; 17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 16) 17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* First 8 Values */ 
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* Second 8 Values */ 17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); 17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8)); 17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* Second 8 Values */ 17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); 17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); 17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = 
_mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); 17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); 17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* First 8 Values */ 17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + 
lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); 17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); 17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */ 17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* Second 8 Values */ 17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); 17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); 
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ /* First 8 Values */ 17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */ 18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b); 18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b); 18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b); 18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b); 18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ /* Second 8 Values */ 18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/ 18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 
2*/ 18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 1*/ 18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/ 18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 16; 18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 16; 18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; 18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ 18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 8; 18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 8; 18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; 18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* 2*wd multiple of 4 case */ 18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1, dst2, dst3; 18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 2 */ 19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); 19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); 19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 3 */ 19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); 19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); 19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Pack two rows together */ 19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); 19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); 19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 to 3 */ 19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); 19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); 19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); 19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 to row = 3 */ 19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; 19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 4; 19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; 19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* ht multiple of 2 case */ 19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 16) 19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* First 8 Values */ 19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ /* Second 8 Values */ 19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* First 8 Values */ 20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */ 20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ /* Second 8 Values */ 20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ /* First 8 Values */ 20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */ 20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b); 20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b); 20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ /* Second 8 Values */ 20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/ 20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/ 20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 16; 20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 16; 20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 
16; 20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values */ 20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* 
store four 8-bit output values */ 20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ 20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 8; 20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 8; 20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; 20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(8-output values in single iteration) */ 20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* 2*wd multiple of 4 case */ 20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* outer for loop starts from here */ 20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* Pack two rows together */ 21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (pi2_src1[col] + pi2_src2[col]) */ 21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* (i4_tmp >> shift) */ 21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* dst row = 1 */ 21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* store four 8-bit output values */ 21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* row = 1 */ 21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* To update pointer */ 21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 += 4; 21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 += 4; 21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; 
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner loop ends here(4-output values in single iteration) */ 21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 2144