/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/


/**
*******************************************************************************
* @file
*  ihevc_inter_pred_filters_x86_intr.c
*
* @brief
*  Contains function definitions for inter prediction interpolation filters
*  coded in x86 intrinsics
*
*
* @author
*
*
* @par List of Functions:
*  - ihevc_inter_pred_luma_copy_w16out_sse42()
*  - ihevc_inter_pred_chroma_copy_sse42()
*  - ihevc_inter_pred_chroma_copy_w16out_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <assert.h>
#include <string.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"

#include <immintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>

630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/ 640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Function Definitions */ 650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/ 660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Interprediction luma filter for copy 16bit output 740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copies the array of width 'wd' and height 'ht' from the location pointed 770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* by 'src' to the location pointed by 'dst' The output is upshifted by 6 780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* bits and is used as input for vertical filtering or weighted prediction 790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src 810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* UWORD8 pointer to the source 820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pi2_dst 840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WORD16 pointer to 
the destination 850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd 870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer source stride 880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd 900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer destination stride 910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi1_coeff 930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WORD8 pointer to the filter coefficients 940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht 960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer height of the array 970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd 990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer width of the array 1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src, 
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst, 1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD8 *pi1_coeff, 1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col; 1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi1_coeff); 1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 4 == 0); /* checking assumption*/ 1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 4 == 0); /* checking assumption*/ 1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wd & 7)) /* multiple of 8 case */ 1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 8) 1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 8; /* pointer update */ 1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 8; /* pointer update */ 1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(8-output values in single iteration) */ 1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wd; /* pointer update */ 1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4 * dst_strd - wd; /* pointer update */ 1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wd = multiple of 4 case */ 1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wd; col += 4) 1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4; /* pointer update */ 1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4; /* pointer update */ 1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(4-output values in single iteration) */ 1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wd; /* pointer update */ 1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4 * dst_strd - wd; /* pointer update */ 1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Chroma interprediction filter for copy 2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
*  Copies the array of width 'wd' and height 'ht' from the location pointed
*  by 'src' to the location pointed by 'dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] pi1_coeff
*  WORD8 pointer to the filter coefficients
*
* @param[in] ht
*  integer height of the array
*
* @param[in] wd
*  integer width of the array
*
* @returns
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src, 2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UWORD8 *pu1_dst, 2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD8 *pi1_coeff, 2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, wdx2; 2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 2 == 0); /* checking assumption*/ 2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi1_coeff); 2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wdx2 = wd * 2; 2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (ht & 3)) /* ht multiple of 4 */ 
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */ 2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 16) 2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */ 2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 16; /* pointer update */ 2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; /* pointer update */ 2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(16-output values in single iteration) */ 2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wdx2; /* pointer update */ 2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 8; /* pointer update */ 3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; /* pointer update */ 3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(8-output values in single iteration) */ 3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wdx2; /* pointer update */ 3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wdx2 = multiple of 4 case */ 3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1, dst2, dst3; 3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src0_16x8b); 3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src1_16x8b); 3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst2 = _mm_cvtsi128_si32(src2_16x8b); 3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst3 = _mm_cvtsi128_si32(src3_16x8b); 3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 4 8-bit output values 
*/ 3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */ 3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */ 3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */ 3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */ 3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4; /* pointer update */ 3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; /* pointer update */ 3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(4- output values in single iteration) */ 3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wdx2; /* pointer update */ 3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* ht multiple of 2 */ 3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */ 3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 16) 3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar { 3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */ 3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 16; /* pointer update */ 3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 16; /* pointer update */ 3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(16-output values in single iteration) */ 3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 2 * src_strd - wdx2; /* pointer update */ 3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 8; /* pointer update */ 3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 8; /* pointer update */ 3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(8-output values in single iteration) */ 3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 2 * src_strd - wdx2; /* pointer update */ 3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wdx2 = multiple of 4 case */ 
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst0, dst1; 3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst0 = _mm_cvtsi128_si32(src0_16x8b); 3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar dst1 = _mm_cvtsi128_si32(src1_16x8b); 4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 4 8-bit output values */ 4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */ 4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */ 4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4; /* pointer update */ 4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 4; /* pointer update */ 4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(4- output values in single iteration) */ 
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 2 * src_strd - wdx2; /* pointer update */ 4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/** 4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief 4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* chroma interprediction filter for copying 16bit output 4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description: 4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copies the array of width 'wd' and height 'ht' from the location pointed 4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* by 'src' to the location pointed by 'dst' The output is upshifted by 6 4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* bits and is used as input for vertical filtering or weighted prediction 4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src 4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* UWORD8 pointer to the source 4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pi2_dst 
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WORD16 pointer to the destination 4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd 4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer source stride 4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd 4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer destination stride 4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi1_coeff 4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WORD8 pointer to the filter coefficients 4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht 4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer height of the array 4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd 4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* integer width of the array 4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns 4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks 4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* None 4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* 4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************* 4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/ 4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakarvoid ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src, 4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD16 *pi2_dst, 4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 src_strd, 4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 dst_strd, 4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD8 *pi1_coeff, 4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 ht, 4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 wd) 4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{ 4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar WORD32 row, col, wdx2; 4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(wd % 2 == 0); /* checking assumption*/ 4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar ASSERT(ht % 2 == 0); /* checking assumption*/ 4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar UNUSED(pi1_coeff); 4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar wdx2 = wd * 2; 4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (ht & 3)) /* multiple of 4 case */ 4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 7)) /* multiple of 8 case */ 4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 8; /* pointer update */ 5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 8; /* pointer update */ 5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(8-output values in single iteration) */ 5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wdx2; /* pointer update */ 5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4 * dst_strd - wdx2; /* pointer update */ 5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wdx2 = multiple of 4 case */ 5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 4) 5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4; /* pointer update */ 5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4; /* pointer update */ 5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(4-output values in single iteration) */ 5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4 * src_strd - wdx2; /* pointer update */ 5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4 * dst_strd - wdx2; /* pointer update */ 5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* ht multiple of 2 case */ 5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar if(0 == (wdx2 & 7)) /* multiple of 8 case */ 5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 8) 5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish 
Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 8; /* pointer update */ 5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 8; /* pointer update */ 5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(8-output values in single iteration) */ 5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 2 * src_strd - wdx2; /* pointer update */ 
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 2 * dst_strd - wdx2; /* pointer update */ 5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar else /* wdx2 = multiple of 4 case */ 5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(row = 0; row < ht; row += 2) 5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar for(col = 0; col < wdx2; col += 4) 5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar { 5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar /* storing 16 8-bit output values */ 5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 
_mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 4; /* pointer update */ 5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 4; /* pointer update */ 6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } /* inner for loop ends here(4-output values in single iteration) */ 6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar 6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pu1_src += 2 * src_strd - wdx2; /* pointer update */ 6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar pi2_dst += 2 * dst_strd - wdx2; /* pointer update */ 6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar } 6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar} 608