10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_inter_pred_filters_x86_intr.c
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function definitions for inter prediction  interpolation filters
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  coded in x86 intrinsics
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_inter_pred_luma_copy_w16out_sse42()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_inter_pred_chroma_copy_sse42()
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_inter_pred_chroma_copy_w16out_sse42()
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes                                                             */
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <assert.h>
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_debug.h"
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_inter_pred.h"
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h>
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <smmintrin.h>
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <tmmintrin.h>
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Function Definitions                                                      */
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*       Interprediction luma filter for copy 16bit output
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Copies the array of width 'wd' and height 'ht' from the  location pointed
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    bits and is used as input for vertical filtering or weighted prediction
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the source
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pi2_dst
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  WORD16 pointer to the destination
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi1_coeff
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  WORD8 pointer to the filter coefficients
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer height of the array
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer width of the array
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src,
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD16 *pi2_dst,
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 src_strd,
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 dst_strd,
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD8 *pi1_coeff,
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 ht,
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 wd)
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi1_coeff);
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 4 == 0); /* checking assumption*/
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 4 == 0); /* checking assumption*/
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (wd & 7)) /* multiple of 8 case */
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 4)
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 8)
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit output values */
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 8; /* pointer update */
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 8; /* pointer update */
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner for loop ends here(8-output values in single iteration) */
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_src += 4 * src_strd - wd; /* pointer update */
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst += 4 * dst_strd - wd; /* pointer update */
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* wd = multiple of 4 case */
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < ht; row += 4)
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < wd; col += 4)
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit output values */
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4; /* pointer update */
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 4; /* pointer update */
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner for loop ends here(4-output values in single iteration) */
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_src += 4 * src_strd - wd; /* pointer update */
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_dst += 4 * dst_strd - wd; /* pointer update */
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*      Chroma interprediction filter for copy
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Copies the array of width 'wd' and height 'ht' from the  location pointed
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    by 'src' to the location pointed by 'dst'
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the source
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
* @param[in] pi1_coeff
*  WORD8 pointer to the filter coefficients (not used by the copy filter)
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer height of the array
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer width of the array
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src,
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        UWORD8 *pu1_dst,
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        WORD32 src_strd,
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        WORD32 dst_strd,
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        WORD8 *pi1_coeff,
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        WORD32 ht,
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                        WORD32 wd)
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, wdx2;
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 2 == 0); /* checking assumption*/
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 2 == 0); /* checking assumption*/
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi1_coeff);
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wdx2 = wd * 2;
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (ht & 3)) /* ht multiple of 4 */
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 16)
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 16; /* pointer update */
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 16; /* pointer update */
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(16-output values in single iteration) */
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4 * src_strd - wdx2; /* pointer update */
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 8; /* pointer update */
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 8; /* pointer update */
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /*  inner for loop ends here(8-output values in single iteration) */
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4 * src_strd - wdx2; /* pointer update */
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* wdx2 = multiple of 4 case */
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1, dst2, dst3;
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(src1_16x8b);
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst2 = _mm_cvtsi128_si32(src2_16x8b);
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst3 = _mm_cvtsi128_si32(src3_16x8b);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 4 8-bit output values */
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 4; /* pointer update */
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 4; /* pointer update */
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /*  inner for loop ends here(4- output values in single iteration) */
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4 * src_strd - wdx2; /* pointer update */
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 4 * dst_strd - wdx2; /* pointer update */
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* ht multiple of 2 */
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 16)
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 16; /* pointer update */
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 16; /* pointer update */
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(16-output values in single iteration) */
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 2 * src_strd - wdx2; /* pointer update */
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 8; /* pointer update */
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 8; /* pointer update */
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /*  inner for loop ends here(8-output values in single iteration) */
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 2 * src_strd - wdx2; /* pointer update */
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* wdx2 = multiple of 4 case */
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 dst0, dst1;
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst0 = _mm_cvtsi128_si32(src0_16x8b);
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    dst1 = _mm_cvtsi128_si32(src1_16x8b);
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 4 8-bit output values */
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 4; /* pointer update */
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_dst += 4; /* pointer update */
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /*  inner for loop ends here(4- output values in single iteration) */
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 2 * src_strd - wdx2; /* pointer update */
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 2 * dst_strd - wdx2; /* pointer update */
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*       chroma interprediction filter for copying 16bit output
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Copies the array of width 'wd' and height 'ht' from the  location pointed
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    bits and is used as input for vertical filtering or weighted prediction
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the source
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pi2_dst
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  WORD16 pointer to the destination
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pi1_coeff
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  WORD8 pointer to the filter coefficients
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] ht
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer height of the array
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] wd
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer width of the array
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src,
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD16 *pi2_dst,
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 src_strd,
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 dst_strd,
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD8 *pi1_coeff,
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 ht,
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 wd)
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, wdx2;
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(wd % 2 == 0); /* checking assumption*/
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ASSERT(ht % 2 == 0); /* checking assumption*/
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(pi1_coeff);
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    wdx2 = wd * 2;
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == (ht & 3)) /* multiple of 4 case */
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 7)) /* multiple of 8 case */
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 8; /* pointer update */
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst += 8; /* pointer update */
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(8-output values in single iteration) */
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4 * src_strd - wdx2; /* pointer update */
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* wdx2 = multiple of 4 case */
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 4)
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src2_16x8b = _mm_slli_epi16(src2_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src3_16x8b = _mm_slli_epi16(src3_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 4; /* pointer update */
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst += 4; /* pointer update */
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(4-output values in single iteration) */
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 4 * src_strd - wdx2; /* pointer update */
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else  /* ht multiple of 2 case */
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(0 == (wdx2 & 7)) /* multiple of 8 case */
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 8)
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 8; /* pointer update */
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst += 8; /* pointer update */
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(8-output values in single iteration) */
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 2 * src_strd - wdx2; /* pointer update */
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else /* wdx2 = multiple of 4 case */
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < ht; row += 2)
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                for(col = 0; col < wdx2; col += 4)
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                {
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src0_16x8b = _mm_slli_epi16(src0_16x8b,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    src1_16x8b = _mm_slli_epi16(src1_16x8b,  SHIFT_14_MINUS_BIT_DEPTH);
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    /* storing 16 8-bit output values */
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                 /* row =0 */
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pu1_src += 4; /* pointer update */
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                    pi2_dst += 4; /* pointer update */
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                } /* inner for loop ends here(4-output values in single iteration) */
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_src += 2 * src_strd - wdx2; /* pointer update */
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
608