10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_intra_pred_filters_x86_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function Definition for intra prediction  interpolation filters
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Ittiam
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_ref_filtering_sse42()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_dc_sse42()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_horz_sse42()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_ver_sse42()
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_mode_3_to_9_sse42()
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_mode_11_to_17_sse42()
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_mode_19_to_25_sse42()
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevc_intra_pred_luma_mode_27_to_33_sse42()
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes                                                             */
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <stdlib.h>
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_intra_pred.h"
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_common_tables.h"
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_defs.h"
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_tables_x86_intr.h"
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Constant Macros                                                          */
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#define MAX_CU_SIZE 64 /* Sizes the local filtered-reference buffer in this file: (4 * MAX_CU_SIZE) + 1 bytes */
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#define BIT_DEPTH 8 /* Used for the strong-smoothing threshold: 1 << (BIT_DEPTH - 5) */
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#define T32_4NT 128 /* NOTE(review): presumably 4 * nt for nt == 32; not used in the visible part of the file - confirm */
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#define T16_4NT 64 /* NOTE(review): presumably 4 * nt for nt == 16; not used in the visible part of the file - confirm */
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Function Macros                                                          */
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
/* Evaluates to 1 when bit number x of y is set, and to 0 otherwise.
 * Both arguments and the whole expansion are parenthesized so the macro
 * behaves as a single value inside larger expressions (e.g. GET_BITS(v, n) * 2)
 * and with compound arguments (e.g. GET_BITS(v, a | b)).
 * x must be a valid shift count for int (0 .. 30). */
#define GET_BITS(y,x) (((y) & (1 << (x))) != 0)
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* tables to shuffle 8-bit values */
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* global tables Definition                                                  */
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Function Definition                                                      */
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction interpolation filter for ref_filtering
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Reference DC filtering for neighboring samples dependent  on TU size and
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    mode  Refer to section 8.4.4.2.3 in the standard
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the source
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 nt,
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          UWORD8 *pu1_dst,
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 mode,
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 strong_intra_smoothing_enable_flag)
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 filter_flag;
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 i; /* Generic indexing variable */
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 four_nt = 4 * nt;
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 bi_linear_int_flag = 0;
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 abs_cond_left_flag = 0;
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 abs_cond_top_flag = 0;
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp4, src_temp5, src_temp6, src_temp8;
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //WORD32 strong_intra_smoothing_enable_flag  = 1;
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(0 == filter_flag)
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(pu1_src == pu1_dst)
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            return;
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            if(nt == 4)
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[four_nt] = pu1_src[four_nt];
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(nt == 8)
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[four_nt] = pu1_src[four_nt];
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(nt == 16)
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[four_nt] = pu1_src[four_nt];
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            else if(nt == 32)
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[four_nt] = pu1_src[four_nt];
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* If strong intra smoothin is enabled and transform size is 32 */
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Strong Intra Filtering */
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     - (2 * pu1_src[3 * nt]))) < dc_val;
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      - (2 * pu1_src[nt]))) < dc_val;
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            bi_linear_int_flag = ((1 == abs_cond_left_flag)
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            && (1 == abs_cond_top_flag));
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Extremities Untouched*/
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        au1_flt[0] = pu1_src[0];
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        au1_flt[4 * nt] = pu1_src[4 * nt];
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Strong filtering of reference samples */
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(1 == bi_linear_int_flag)
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            au1_flt[2 * nt] = pu1_src[2 * nt];
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(i = 1; i < (2 * nt); i++)
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(i = 1; i < (2 * nt); i++)
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i const_value_8x16;
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const_value_8x16 = _mm_set1_epi16(2);
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            au1_flt[0] = pu1_src[0];
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            au1_flt[4 * nt] = pu1_src[4 * nt];
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Perform bilinear filtering of Reference Samples */
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(i = 0; i < (four_nt); i += 16)
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_srli_si128(src_temp1, 1);
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_srli_si128(src_temp2, 1);
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 =  _mm_cvtepu8_epi16(src_temp1);
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_slli_epi16(src_temp2,  1);
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_srai_epi16(src_temp1,  2);
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_srli_si128(src_temp4, 1);
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_srli_si128(src_temp5, 1);
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 =  _mm_cvtepu8_epi16(src_temp5);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 =  _mm_cvtepu8_epi16(src_temp6);
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_slli_epi16(src_temp5,  1);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_srai_epi16(src_temp4,  2);
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            au1_flt[4 * nt] = pu1_src[4 * nt];
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(nt == 4)
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[four_nt] = au1_flt[four_nt];
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 8)
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[four_nt] = au1_flt[four_nt];
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 16)
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[four_nt] = au1_flt[four_nt];
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 32)
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[four_nt] = au1_flt[four_nt];
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma dc
*
* @par Description:
*   Intra prediction for DC mode, from the neighboring reference samples
*   pointed to by 'pu1_ref' into the TU block pointed to by 'pu1_dst'. Refer
*   to section 8.4.4.2.5 in the standard.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride (marked UNUSED by this function)
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (marked UNUSED by this function)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_dc_sse42(UWORD8 *pu1_ref,
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 src_strd,
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    UWORD8 *pu1_dst,
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 dst_strd,
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 nt,
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                    WORD32 mode)
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 acc_dc;
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 dc_val, two_dc_val, three_dc_val;
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row;
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 log2nt = 5;
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt, three_nt;
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp8, src_temp9, src_temp10, src_temp2;
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_zero = _mm_set1_epi32(0);
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(mode);
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    switch(nt)
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 32:
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 5;
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 16:
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 4;
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 8:
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 3;
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 4:
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 2;
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        default:
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt = 2 * nt;
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    three_nt = 3 * nt;
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc = 0;
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Calculate DC value for the transform block */
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 32)
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp;
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 itr_count;
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc = _mm_cvtsi128_si32(src_temp4);
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc += pu1_ref[three_nt];
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc -= pu1_ref[two_nt];
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* computing acc_dc value */
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        dc_val = (acc_dc + nt) >> (log2nt + 1);
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_dc_val = 2 * dc_val;
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        three_dc_val = 3 * dc_val;
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp = _mm_set1_epi8(dc_val);
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(itr_count = 0; itr_count < 2; itr_count++)
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 16 * dst_strd;
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i  zero_8x16b;
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i sm1 = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* DC filtering for the first top row and first left column */
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        zero_8x16b = _mm_set1_epi16(0);
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(nt == 4) /* nt multiple of 4*/
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 temp1, temp2, temp3;
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_cvtepu8_epi16(src_temp3);
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc = _mm_cvtsi128_si32(src_temp4);
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc += pu1_ref[three_nt];
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc -= pu1_ref[two_nt];
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* computing acc_dc value */
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dc_val = (acc_dc + nt) >> (log2nt + 1);
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            three_dc_val = 3 * dc_val;
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loding 8-bit 16 pixel */
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            two_dc_val = 2 * dc_val;
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_srli_epi16(src_temp2, 2);
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_cvtsi128_si32(src_temp2);
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[0]) = temp1;
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  retore  first value*/
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            >> 2);
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row++)
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                >> 2;
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_insert_epi8(src_temp2, dc_val, 0);
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_shuffle_epi8(src_temp2, sm1);
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_shuffle_epi8(src_temp2, sm1);
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_shuffle_epi8(src_temp2, sm1);
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(1 * dst_strd) + 0], 0);
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(2 * dst_strd) + 0], 0);
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(3 * dst_strd) + 0], 0);
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_cvtsi128_si32(src_temp2);
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_cvtsi128_si32(src_temp3);
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_cvtsi128_si32(src_temp4);
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 8) /* if nt%8==0*/
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc = _mm_cvtsi128_si32(src_temp4);
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc += pu1_ref[three_nt];
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc -= pu1_ref[two_nt];
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* computing acc_dc value */
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dc_val = (acc_dc + nt) >> (log2nt + 1);
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            three_dc_val = 3 * dc_val;
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            two_dc_val = 2 * dc_val;
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loding 8-bit 16 pixel */
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_srli_epi16(src_temp2, 2);
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  retore  first value*/
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            >> 2);
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row++)
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                >> 2;
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Fill the remaining rows with DC value*/
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_set1_epi8(dc_val);
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_set1_epi8(dc_val);
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_set1_epi8(dc_val);
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_set1_epi8(dc_val);
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = _mm_set1_epi8(dc_val);
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_set1_epi8(dc_val);
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = _mm_set1_epi8(dc_val);
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 16) /* if nt%8==0*/
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc = _mm_cvtsi128_si32(src_temp4);
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc += pu1_ref[three_nt];
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc -= pu1_ref[two_nt];
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* computing acc_dc value */
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dc_val = (acc_dc + nt) >> (log2nt + 1);
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            three_dc_val = 3 * dc_val;
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            two_dc_val = 2 * dc_val;
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_srli_epi16(src_temp2, 2);
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_srli_epi16(src_temp10, 2);
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  retore  first value*/
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            >> 2);
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row++)
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                >> 2;
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Fill the remaining rows with DC value*/
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 =  _mm_set1_epi8(dc_val);
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_set1_epi8(dc_val);
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_set1_epi8(dc_val);
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_set1_epi8(dc_val);
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  _mm_set1_epi8(dc_val);
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  _mm_set1_epi8(dc_val);
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  _mm_set1_epi8(dc_val);
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row += 8)
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((8) * dst_strd)], 0);
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((9) * dst_strd)], 0);
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((10) * dst_strd)], 0);
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((11) * dst_strd)], 0);
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((12) * dst_strd)], 0);
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((13) * dst_strd)], 0);
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((14) * dst_strd)], 0);
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((15) * dst_strd)], 0);
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 32) /* if nt%8==0*/
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16, src_temp17;
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loding 8-bit 16 pixel */
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 24));
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  _mm_cvtepu8_epi16(src_temp6);
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp9 =  _mm_cvtepu8_epi16(src_temp9);
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc = _mm_cvtsi128_si32(src_temp4);
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc += pu1_ref[three_nt];
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            acc_dc -= pu1_ref[two_nt];
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* computing acc_dc value */
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            dc_val = (acc_dc + nt) >> (log2nt + 1);
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            three_dc_val = 3 * dc_val;
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            two_dc_val = 2 * dc_val;
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp6, src_temp1);
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp9, src_temp1);
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_add_epi16(src_temp10, src_temp1);
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_srli_epi16(src_temp2, 2);
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_srli_epi16(src_temp6, 2);
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp9 = _mm_srli_epi16(src_temp9, 2);
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_srli_epi16(src_temp10, 2);
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_packus_epi16(src_temp2, src_temp6);
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_packus_epi16(src_temp9, src_temp10);
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp10);
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  retore  first value*/
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            >> 2);
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row++)
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                >> 2;
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* Fill the remaining rows with DC value*/
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_insert_epi8(src_temp1, dc_val, 0);
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  src_temp1;
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = src_temp1;
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  src_temp1;
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  src_temp1;
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  src_temp1;
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  src_temp1;
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp12 = src_temp1;
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp13 = src_temp1;
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp14 = src_temp1;
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp15 = src_temp1;
8550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp16 = src_temp1;
8560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp17 = src_temp1;
8570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp11 = src_temp1;
8580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 1; row < nt; row++)
8600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
8610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
8620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
8630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
8640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
8650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
8660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
8670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
8680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_temp1);
8700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), src_temp11);
8710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
8720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), src_temp12);
8730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
8740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), src_temp13);
8750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
8770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), src_temp14);
8780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
8790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd) + 16), src_temp15);
8800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
8810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd) + 16), src_temp16);
8820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
8830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd) + 16), src_temp17);
8840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
8870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
8890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
8900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
8910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
8930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
8940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
8960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     Intra prediction filter for the luma horizontal mode.
8970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
8980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
8990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*      Horizontal intra prediction (mode 10) from the reference samples
9000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*      pointed to by 'pu1_ref' into the TU block pointed to by 'pu1_dst'. Refer
9010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*      to section 8.4.4.2.6 in the standard (special case).
9020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
9040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference samples
9050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
9070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
9080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
9100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (marked UNUSED in this implementation)
9110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
9130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
9140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
9160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
9170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
9190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intra prediction mode (marked UNUSED in this implementation)
9200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
9220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
9240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
9250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
9260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
9270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
9280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_horz_sse42(UWORD8 *pu1_ref,
9300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 src_strd,
9310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      UWORD8 *pu1_dst,
9320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 dst_strd,
9330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 nt,
9340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 mode)
9350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
9360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row;
9380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt;
9390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
9400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(mode);
9410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt = 2 * nt;
9430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 32)
9460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
9470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
9480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
9490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
9500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 16)
9520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
9530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
9540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));
9550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 =  _mm_srli_si128(src_temp1, 1);
9570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 =  _mm_srli_si128(src_temp1, 2);
9580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 =  _mm_srli_si128(src_temp1, 3);
9590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 =  _mm_srli_si128(src_temp1, 4);
9600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 =  _mm_srli_si128(src_temp1, 5);
9610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 =  _mm_srli_si128(src_temp1, 6);
9620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8 =  _mm_srli_si128(src_temp1, 7);
9630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp9 =  _mm_srli_si128(src_temp1, 8);
9650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp10 =  _mm_srli_si128(src_temp1, 9);
9660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11 =  _mm_srli_si128(src_temp1, 10);
9670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12 =  _mm_srli_si128(src_temp1, 11);
9680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13 =  _mm_srli_si128(src_temp1, 12);
9690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14 =  _mm_srli_si128(src_temp1, 13);
9700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15 =  _mm_srli_si128(src_temp1, 14);
9710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16 =  _mm_srli_si128(src_temp1, 15);
9720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8 =  _mm_shuffle_epi8(src_temp8, sm);
9740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 =  _mm_shuffle_epi8(src_temp7, sm);
9750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 =  _mm_shuffle_epi8(src_temp6, sm);
9760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 =  _mm_shuffle_epi8(src_temp5, sm);
9770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 =  _mm_shuffle_epi8(src_temp4, sm);
9780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 =  _mm_shuffle_epi8(src_temp3, sm);
9790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 =  _mm_shuffle_epi8(src_temp2, sm);
9800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 =  _mm_shuffle_epi8(src_temp1, sm);
9810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16 =  _mm_shuffle_epi8(src_temp16, sm);
9830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15 =  _mm_shuffle_epi8(src_temp15, sm);
9840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14 =  _mm_shuffle_epi8(src_temp14, sm);
9850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13 =  _mm_shuffle_epi8(src_temp13, sm);
9860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12 =  _mm_shuffle_epi8(src_temp12, sm);
9870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11 =  _mm_shuffle_epi8(src_temp11, sm);
9880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp10 =  _mm_shuffle_epi8(src_temp10, sm);
9890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp9 =  _mm_shuffle_epi8(src_temp9, sm);
9900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
9910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
9920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
9930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
9940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
9950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
9960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
9970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
9980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);
9990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
10010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
10020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
10030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
10040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
10050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
10060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
10070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);
10080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
10100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
10110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
10120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
10130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
10140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
10150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
10160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);
10170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
10190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
10200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
10210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
10220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
10230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
10240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
10250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);
10260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
10280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
10300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
10320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
10330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
10350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
10360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp10, zero_8x16b, src_temp7;
10370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* DC filtering for the first top row and first left column */
10390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        zero_8x16b = _mm_set1_epi16(0);
10410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*Filtering done for the 1st row */
10430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2 =  _mm_set1_epi16(pu1_ref[two_nt - 1]);
10450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp10 =  _mm_set1_epi16(pu1_ref[two_nt]);
10460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  loding 8-bit 16 pixels */
10480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
10490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
10510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
10530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);
10540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
10560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_srai_epi16(src_temp3, 1);
10570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
10590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);
10600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(nt == 4)
10620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
10630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            int temp1, temp2, temp3;
10640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
10650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_cvtsi128_si32(src_temp3);
10660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[0]) = temp1;
10680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
10700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
10710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
10720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_cvtsi128_si32(src_temp2);
10740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_cvtsi128_si32(src_temp3);
10750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_cvtsi128_si32(src_temp4);
10760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
10780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
10790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
10800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
10810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
10830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 8)
10840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
10850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);
10860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
10890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
10900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
10910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
10920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
10930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
10940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
10950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);
10970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
10980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
10990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
11000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
11010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
11020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
11030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
11040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
11050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);
11060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
11080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 16)
11090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
11100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
11110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
11120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
11140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_srai_epi16(src_temp10, 1);
11150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);
11160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
11180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);
11190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
11210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
11220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
11230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
11240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
11250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
11260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
11270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
11280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp10 =  _mm_set1_epi8(pu1_ref[two_nt - 9]);
11290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
11310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
11320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
11330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
11340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
11350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
11360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
11370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);
11380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 10]);
11400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 11]);
11410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 12]);
11420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 13]);
11430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 14]);
11440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 15]);
11450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 16]);
11460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
11480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
11490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
11500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
11510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
11520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
11530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);
11540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
11560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
11570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
11580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
11600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
11610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
11630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*     Intra prediction interpolation filter for vertical luma variable.
11640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
11660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Vertical intra prediction from the reference neighboring samples
11670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    pointed to by 'pu1_ref' into the TU block pointed to by 'pu1_dst'. Refer
11680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    to section 8.4.4.2.6 in the standard (Special case)
11690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
11710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference neighboring samples
11720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
11740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
11750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
11770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (unused by this function)
11780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
11800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
11810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
11830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
11840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
11860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode (unused by this function)
11870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
11890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
11910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
11920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
11930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
11940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
11950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
11970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_ver_sse42(UWORD8 *pu1_ref,
11980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 src_strd,
11990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     UWORD8 *pu1_dst,
12000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 dst_strd,
12010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 nt,
12020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                     WORD32 mode)
12030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
12040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row;
12050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD16 s2_predpixel;
12060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt = 2 * nt;
12070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp0, src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7;
12080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
12100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(mode);
12110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 32)
12130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
12140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1, temp2;
12150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 itr_count;
12160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
12180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
12190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(itr_count = 0; itr_count < 2; itr_count++)
12210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
12220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
12230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
12240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
12250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
12260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
12270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
12280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
12290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
12300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
12310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
12330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
12340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
12350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
12360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
12370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
12380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
12390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
12400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
12420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
12430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
12440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
12450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
12460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
12470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
12480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
12490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
12510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
12520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
12530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
12540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
12550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
12560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
12570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
12580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 16 * dst_strd;
12600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
12610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
12620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
12640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
12660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*Filtering done for the 1st column */
12670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = nt - 1; row >= 0; row--)
12680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
12690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            s2_predpixel = pu1_ref[two_nt + 1]
12700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
12710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
12720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
12730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Replication to next columns*/
12750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        if(nt == 4)
12770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
12780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            int temp1, temp2, temp3, temp4;
12790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 =   _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
12810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 =  src_temp2;
12820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 =  src_temp2;
12830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 =  src_temp2;
12840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(0 * dst_strd)], 0);
12860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(1 * dst_strd)], 0);
12870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(2 * dst_strd)], 0);
12880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[(3 * dst_strd)], 0);
12890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 = _mm_cvtsi128_si32(src_temp2);
12910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 = _mm_cvtsi128_si32(src_temp3);
12920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 = _mm_cvtsi128_si32(src_temp4);
12930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 = _mm_cvtsi128_si32(src_temp5);
12940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
12950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loding 4-bit 8 pixels values */
12960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
12970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
12980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
12990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
13000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
13020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 8)
13030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
13040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
13060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = src_temp0;
13070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = src_temp0;
13080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = src_temp0;
13090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = src_temp0;
13100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = src_temp0;
13110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = src_temp0;
13120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = src_temp0;
13130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((0) * dst_strd)], 0);
13150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
13160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
13170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
13180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
13190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
13200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
13210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
13220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
13240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
13250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
13260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
13270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
13280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
13290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
13300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
13310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
13340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        else if(nt == 16)
13350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
13360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < nt; row += 8)
13370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
13380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
13400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = src_temp0;
13410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = src_temp0;
13420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = src_temp0;
13430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = src_temp0;
13440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = src_temp0;
13450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = src_temp0;
13460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = src_temp0;
13470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((row + 0) * dst_strd)], 0);
13490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((row + 1) * dst_strd)], 0);
13500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((row + 2) * dst_strd)], 0);
13510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((row + 3) * dst_strd)], 0);
13520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((row + 4) * dst_strd)], 0);
13530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((row + 5) * dst_strd)], 0);
13540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((row + 6) * dst_strd)], 0);
13550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((row + 7) * dst_strd)], 0);
13560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
13580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp1);
13590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp2);
13600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp3);
13610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp4);
13620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp5);
13630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp6);
13640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp7);
13650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
13670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
13690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
13720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
13730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
13750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
13760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
13770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
13790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction interpolation filter for luma mode 3 to mode 9
13800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
13820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction for modes 3 to 9 (positive angle, horizontal mode) using
13830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    the reference neighboring samples pointed to by 'pu1_ref', written to the
13840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    TU block location pointed to by 'pu1_dst'
13850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
13870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference neighboring samples
13880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
13900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
13910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
13930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (marked UNUSED in this function)
13940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
13960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
13970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
13980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
13990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
14000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
14010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
14020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
14030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
14040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
14050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
14060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
14070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
14080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
14090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
14100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
14110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_mode_3_to_9_sse42(UWORD8 *pu1_ref,
14140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 src_strd,
14150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             UWORD8 *pu1_dst,
14160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 dst_strd,
14170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 nt,
14180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                             WORD32 mode)
14190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
14200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
14210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt = 2 * nt;
14220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 intra_pred_ang;
14230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
14260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i fract_4x32b, intra_pred_ang_4x32b;
14270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
14280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
14290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Intra Pred Angle according to the mode */
14320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang = gai4_ihevc_ang_table[mode];
14330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
14350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* samples dependent on distance to obtain destination sample */
14360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
14380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* samples dependent on distance to obtain destination sample */
14390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b  = _mm_set1_epi16(16);
14410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp2_4x32b = _mm_set1_epi32(31);
14420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp3_4x32b = _mm_set1_epi32(32);
14430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp4_4x32b = _mm_set1_epi32(4);
14440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
14460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
14490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
14510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
14520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
14540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 4)
14560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
14570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
14590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        int temp11, temp21, temp31, temp41;
14600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        // WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
14610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i fract1_8x16b, fract2_8x16b;
14630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
14640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
14660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
14670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
14680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* pos = ((row + 1) * intra_pred_ang); */
14700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
14710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* idx = pos >> 5; */
14730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
14740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* fract = pos & (31); */
14760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
14770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*(32 - fract) */
14790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
14800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
14820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
14830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
14850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
14860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
14880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
14890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
14910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
14920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
14930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
14940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
14950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
14960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
14970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
14980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
14990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
15000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
15010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
15020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* loding 8-bit 16 pixels */
15040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
15050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
15060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
15070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
15080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
15100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
15110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
15120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
15130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
15150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
15160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
15170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
15180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
15190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
15210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
15220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
15230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
15240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
15250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
15270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
15280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
15290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
15300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
15310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* converting 16 bit to 8 bit */
15330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
15340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
15350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
15380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
15390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
15410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
15420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
15430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
15440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
15460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
15470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
15480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
15490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* loding 4-bit 8 pixels values */
15510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
15520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
15530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
15540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
15550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
15570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 16 || nt == 32)
15590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
15600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
15610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
15620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
15630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp4_4x32b = _mm_set1_epi16(8);
15640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
15650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(two_nt);
15660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = 0; col < nt; col += 8)
15680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
15690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
15700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
15710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
15720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
15740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
15760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
15770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
15790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
15800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
15820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
15830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
15850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
15860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
15880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
15890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
15910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
15920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
15950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
15960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
15970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
15980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
15990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
16000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
16010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
16030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
16040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
16050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
16060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; ref_main_idx = two_nt - idx */
16080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
16090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
16110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
16130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
16140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
16150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
16160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
16180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
16190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
16200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
16210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < nt; row += 8)
16230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
16240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
16250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
16260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
16290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
16300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
16320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
16330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
16340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
16350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
16360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
16380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/
16390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/
16400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/
16410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/
16420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
16440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
16450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
16460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
16470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/
16490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/
16500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/
16510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/
16520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
16540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
16550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
16560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
16570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
16580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
16600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
16610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
16620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
16630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
16640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
16660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
16670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
16680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
16690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
16700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
16720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
16730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
16740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
16750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
16760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
16780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
16790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
16800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
16810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
16820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
16840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
16850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
16860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
16870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
16880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
16900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
16910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
16920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
16940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
16950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
16960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
16970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
16980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
16990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
17010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
17020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
17040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
17050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
17070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
17080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
17100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
17110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
17130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
17140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
17160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
17170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
17190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
17200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b);          /* row=7*/
17220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b);       /* row=6*/
17240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b);       /* row=5*/
17260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b);       /* row=4*/
17280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b);       /* row=3*/
17300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b);       /* row=2*/
17320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b);       /* row=1*/
17340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b);       /* row=0*/
17360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
17380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
17390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
17400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
17410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
17420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
17430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
17440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
17450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp4_4x32b = _mm_set1_epi16(8);
17460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
17470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
17480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
17490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
17500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
17510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
17530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
17550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
17560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
17580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
17590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
17610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
17620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; ref_main_idx = (two_nt - nt) - idx */
17640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
17650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
17670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
17680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
17700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
17710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
17730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
17740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
17770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
17780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
17800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
17810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
17820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
17830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
17850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
17860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
17870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
17880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
17900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
17910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
17920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
17930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
17950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
17960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
17970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
17980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
17990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
18000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
18010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
18020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
18040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
18050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
18070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
18080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
18090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
18100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
18110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
18130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/
18140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/
18150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/
18160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/
18170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
18190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
18200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
18210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
18220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/
18240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/
18250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/
18260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/
18270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
18290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
18300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
18310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
18320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
18330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
18350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
18360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
18370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
18380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
18390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
18410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
18420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
18430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
18440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
18450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
18470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
18480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
18490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
18500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
18510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
18530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
18540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
18550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
18560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
18570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
18590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
18600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
18610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
18620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
18630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
18650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
18660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
18670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
18690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
18700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
18710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
18730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
18740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
18760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
18770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
18790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
18800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
18820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
18830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
18850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
18860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
18880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
18890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
18910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
18920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
18940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
18950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
18960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b);       /* row=0*/
18970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b);       /* row=1*/
18980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b);       /* row=2*/
18990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b);       /* row=3*/
19000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b);       /* row=4*/
19010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b);       /* row=5*/
19020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b);       /* row=6*/
19040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b);          /* row=7*/
19050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
19070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
19080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
19090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
19110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
19130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
19140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
19160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   Intra prediction interpolation filter for luma mode 11 to mode 17
19170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
19190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction for modes 11 to 17 (negative angle, horizontal modes),
19200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    using the neighboring reference samples pointed to by 'pu1_ref' to fill
19210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    the TU block pointed to by 'pu1_dst'
19220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
19240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference (neighboring) samples
19250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
19270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
19280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
19300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (marked UNUSED in this function)
19310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
19330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
19340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
19360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
19370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
19390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intra prediction mode (11 to 17)
19400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
19420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
19440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
19450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
19460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
19470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
19480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_mode_11_to_17_sse42(UWORD8 *pu1_ref,
19510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 src_strd,
19520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               UWORD8 *pu1_dst,
19530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 dst_strd,
19540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 nt,
19550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 mode)
19560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
19570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* This function and ihevc_intra_pred_luma_mode_19_to_25 are same except*/
19590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* for ref main & side samples assignment,can be combined for */
19600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* optimzation*/
19610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col, k;
19630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt;
19640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
19650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 ref_idx;
19660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
19680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i fract_4x32b,  intra_pred_ang_4x32b;
19690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
19700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
19730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *ref_main;
19740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *ref_temp;
19750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
19760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    inv_ang_sum = 128;
19780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt    = 2 * nt;
19790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ref_temp = ref_tmp + 1;
19800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ref_main = ref_temp + nt - 1;
19810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang = gai4_ihevc_ang_table[mode];
19820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
19840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* samples dependent on distance to obtain destination sample */
19850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b  = _mm_set1_epi16(16);
19860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp2_4x32b = _mm_set1_epi32(31);
19870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp3_4x32b = _mm_set1_epi32(32);
19880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp4_4x32b = _mm_set1_epi32(4);
19890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt_4x32b = _mm_set1_epi32(1);
19910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
19940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
19960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
19970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
19980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
19990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 4)
20010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
20020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
20040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        int temp11, temp21, temp31, temp41;
20050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//        WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
20060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i fract1_8x16b, fract2_8x16b;
20080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
20090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
20110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
20120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
20130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
20150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* This have to be removed during optimization*/
20160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
20170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
20180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main = ref_temp + nt - 1;
20200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(k = 0; k < nt + 1; k++)
20210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
20220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main = ref_temp + nt - 1;
20240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_idx = (nt * intra_pred_ang) >> 5;
20250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* SIMD Optimization can be done using look-up table for the loop */
20270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For negative angled derive the main reference samples from side */
20280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  reference samples refer to section 8.4.4.2.6 */
20290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(k = -1; k > ref_idx; k--)
20300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
20310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            inv_ang_sum += inv_ang;
20320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
20330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
20340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* pos = ((row + 1) * intra_pred_ang); */
20370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
20380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* idx = pos >> 5; */
20400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
20410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* fract = pos & (31); */
20430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
20440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*(32 - fract) */
20460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
20470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
20490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
20500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
20520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
20530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
20550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
20560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
20580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
20590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
20600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
20610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
20630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
20640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
20650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
20660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
20670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
20680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
20690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* loding 8-bit 16 pixels */
20710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
20720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
20730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
20740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
20750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
20770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
20780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
20790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
20800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
20820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
20830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
20840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
20850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
20860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
20880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
20890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
20900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
20910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
20920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
20940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
20950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
20960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
20970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
20980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
20990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* converting 16 bit to 8 bit */
21000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
21010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
21020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
21050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
21060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
21080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
21090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
21100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
21110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
21130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
21140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
21150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
21160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* storing four 8-bit pixel values per row */
21180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
21190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
21200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
21210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
21220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
21230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 32)
21250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
21260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1, temp2, temp3, temp11, temp12;
21290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values0, src_values1;
21300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
21310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
21330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
21340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
21350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
21360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For negative angles, derive the main reference samples from the side reference samples */
21380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
21400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
21410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
21430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
21440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
21460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values1 = _mm_shuffle_epi8(src_values1, temp2);
21470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp12);
21480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values1 = _mm_shuffle_epi8(src_values1, temp11);
21490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_shuffle_epi8(temp1, temp2);
21510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp3 = _mm_shuffle_epi8(temp3, temp2);
21520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
21540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
21550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
21560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
21570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
21600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
21610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
21620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp4_4x32b = _mm_set1_epi16(8);
21630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
21640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
21650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = 0; col < nt; col += 8)
21670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
21680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
21690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
21700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
21710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
21730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
21750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
21760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
21780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
21790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
21810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
21820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = (pos >> 5) + 1; */
21840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
21850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
21870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
21880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
21890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
21910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
21920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
21940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
21950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
21970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
21980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
21990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
22010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
22020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
22030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
22040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
22060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
22070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
22080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
22090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
22110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
22120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
22130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
22140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
22160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
22170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
22180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
22190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < nt; row += 8)
22210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
22220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
22230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
22240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
22270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
22280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
22300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
22310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
22320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
22330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
22340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
22360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
22370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
22380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
22390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
22410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
22420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
22430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
22440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
22450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
22470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
22480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
22490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
22500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
22520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
22530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
22540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
22550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
22570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
22580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
22590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
22600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
22610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
22630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
22640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
22650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
22660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
22670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
22690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
22700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
22710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
22720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
22730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
22750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
22760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
22770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
22780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
22790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
22810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
22820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
22830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
22840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
22850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
22870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
22880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
22890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
22900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
22910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
22930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
22940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
22950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
22960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
22970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
22980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
22990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
23010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
23020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
23040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
23050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
23070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
23080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
23100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
23110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
23140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
23150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
23170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
23180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
23200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
23210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
23220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
23230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
23250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
23270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
23290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=4*/
23310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=5*/
23330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=6*/
23350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=7*/
23370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=8*/
23390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
23410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
23420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
23430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 16)
23440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
23450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1, temp2, temp11, src_values0;
23470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
23480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
23490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
23500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
23510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
23520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
23530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
23550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
23570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_shuffle_epi8(temp1, temp2);
23580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
23590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
23610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
23620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
23640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
23650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
23660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp4_4x32b = _mm_set1_epi16(8);
23670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
23680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
23690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(col = 0; col < nt; col += 8)
23710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
23720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
23730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
23740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
23750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
23770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
23790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
23800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
23820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
23830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
23850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
23860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = (pos >> 5) + 1; */
23880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
23890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
23910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
23920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
23930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
23950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
23960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
23970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
23980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
23990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
24020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
24030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
24050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
24060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
24070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
24080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
24100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
24110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
24120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
24130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
24150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
24160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
24170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
24180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
24200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
24210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
24220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
24230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(row = 0; row < nt; row += 8)
24250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
24260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
24270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
24280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
24310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
24320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
24340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
24350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
24360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
24370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
24380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
24400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
24410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
24420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
24430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
24450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
24460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
24470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
24480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
24490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
24510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
24520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
24530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
24540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
24560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
24570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
24580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
24590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
24610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
24620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
24630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
24640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
24650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
24670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
24680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
24690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
24700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
24710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
24730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
24740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
24750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
24760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
24770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
24790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
24800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
24810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
24820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
24830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
24850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
24860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
24870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
24880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
24890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
24910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
24920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
24930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
24940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
24950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
24960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
24970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
24980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
24990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
25010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
25020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
25030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
25050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
25060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
25080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
25090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
25110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
25120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
25140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
25150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
25180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
25190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
25210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
25220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
25240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
25250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
25260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
25270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
25290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
25310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
25330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=4*/
25350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=5*/
25370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=6*/
25390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=7*/
25410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=8*/
25430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
25450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
25460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
25470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
25480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
25490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp1, temp2, temp11, src_values0;
25520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
25530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
25540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[nt];
25550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
25560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For negative angled derive the main reference samples from side */
25580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
25600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
25610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
25620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp2);
25640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_shuffle_epi8(temp1, temp2);
25650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
25660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_srli_si128(src_values0, 8);
25670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
25690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
25700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
25730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
25740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
25750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp4_4x32b = _mm_set1_epi16(8);
25760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
25770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
25780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
25800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
25810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
25820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
25830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
25850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
25870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
25880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
25900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
25910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
25930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
25940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; */
25960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
25970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
25980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
25990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
26000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
26020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
26030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
26050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
26060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
26080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
26090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
26110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
26120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
26130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
26140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
26160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
26170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
26180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
26190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
26210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
26220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
26230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
26240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
26260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
26270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
26280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
26290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
26310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
26320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
26330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
26350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
26360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
26380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
26390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
26400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
26410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
26420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 16 8-bit pixels */
26440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/
26450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/
26460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=7*/
26470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/
26480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
26500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
26510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
26520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
26530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
26550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
26560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
26570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
26580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
26600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
26610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
26620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
26630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
26640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
26660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
26670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
26680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
26690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
26700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
26720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
26730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
26740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
26750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
26760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
26780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* row=0*/
26790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* row=1*/
26800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* row=2*/
26810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* row=3*/
26820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
26840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
26850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
26860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
26870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
26880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
26900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
26910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
26920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
26930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
26940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
26960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
26970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
26980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
26990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
27000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
27010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
27020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
27040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
27050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
27070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
27080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
27100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
27110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
27130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
27140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
27170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
27180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
27200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
27210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
27230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
27240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
27250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
27260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b);       /* row=0*/
27280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b);       /* row=1*/
27300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b);       /* row=2*/
27320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b);       /* row=3*/
27340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b);       /* row=4*/
27360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b);       /* row=5*/
27380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b);       /* row=6*/
27400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b);       /* row=7*/
27420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
27440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
27450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
27460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
27480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
27520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
27530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
27550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*   Intra prediction interpolation filter for luma mode 19 to mode 25
27560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
27580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction for modes 19 to 25 (negative-angle vertical modes): predicts
27590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    the TU block at 'pu1_dst' from the reference neighboring samples pointed to
27600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    by 'pu1_ref'
27610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
27630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference neighboring samples
27640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
27660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
27670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
27690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (unused by this function)
27700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
27720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
27730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
27750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
27760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
27780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
27790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
27810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
27830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
27840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
27850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
27860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
27870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_mode_19_to_25_sse42(UWORD8 *pu1_ref,
27900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 src_strd,
27910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               UWORD8 *pu1_dst,
27920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 dst_strd,
27930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 nt,
27940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 mode)
27950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
27960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
27970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, k;
27980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt, intra_pred_ang;
27990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 inv_ang, inv_ang_sum;
28000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //WORD32 ref_main_idx, pos, fract, idx;
28010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 ref_idx;
28020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
28030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UWORD8 *ref_main, *ref_temp;
28040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i  /*fract_8x16b,*/ const_temp_8x16b, sm3;
28060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp1, temp2, temp3, temp4;
28070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp11, temp12, temp13, temp14;
28080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
28090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt = 2 * nt;
28110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang = gai4_ihevc_ang_table[mode];
28120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
28130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Intermediate reference samples for negative angle modes */
28150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* This have to be removed during optimization*/
28160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
28170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ref_temp = ref_tmp + 1;
28180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ref_main = ref_temp + nt - 1;
28190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
28220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_8x16b = _mm_set1_epi16(16);
28260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 32)
28280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
28290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
28310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
28320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
28330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values0, src_values1, src_values2, src_values3;
28350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i  src_values4, src_values5, src_values6, src_values7;
28360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 col = 0;
28370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
28390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* This have to be removed during optimization*/
28400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
28410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
28420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
28430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
28440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* SIMD Optimization can be done using look-up table for the loop */
28460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For negative angled derive the main reference samples from side */
28470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  reference samples refer to section 8.4.4.2.6 */
28480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
28490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
28500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
28520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
28530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
28550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values1 = _mm_shuffle_epi8(src_values1, temp12);
28560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
28580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
28590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
28600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
28610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
28630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
28640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp8_4x32b = _mm_set1_epi16(8);
28650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
28670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
28690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
28700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
28720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 8)
28740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
28750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
28770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
28790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
28800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
28820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
28830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
28850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
28860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; */
28880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
28890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
28910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
28920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
28940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
28950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
28970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
28980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
28990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
29000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
29010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
29030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
29040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
29050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
29060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
29080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
29090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
29100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
29110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
29130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
29140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < nt; col += 16)
29150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
29160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
29170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
29180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
29190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
29200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
29210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
29220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
29230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
29240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
29260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
29270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
29280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
29290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
29300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
29310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
29320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
29330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
29360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
29370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
29380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
29390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
29400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
29410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
29420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
29430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
29450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
29460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
29470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
29480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
29490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
29500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
29510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
29520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
29530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
29550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
29560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
29570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
29580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
29590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
29600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
29610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
29620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
29630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
29650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
29660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
29670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
29680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
29690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row */
29710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
29720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
29730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
29740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
29750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
29780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
29790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
29800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
29810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
29820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
29830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
29840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
29850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
29870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
29880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
29890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
29900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
29910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
29920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
29930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
29940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
29960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
29970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
29980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
29990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
30000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
30010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
30020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
30030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
30040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
30060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
30070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
30080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
30090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
30100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
30110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
30120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
30130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
30140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
30160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
30170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
30180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
30190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
30200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
30210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
30220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
30230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
30240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
30260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
30270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
30280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
30290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
30300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row */
30320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
30330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
30340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
30350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
30360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
30380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 8 * dst_strd;
30390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
30400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
30420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 16) /* for nt = 16 case */
30430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
30440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
30460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
30470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
30480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values0, src_values1, src_values2, src_values3;
30490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i  src_values4, src_values5, src_values6, src_values7;
30500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
30530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
30540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
30550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
30560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
30580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
30600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
30620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
30640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
30650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
30670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
30680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp8_4x32b = _mm_set1_epi16(8);
30690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
30710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
30730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
30740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
30760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 8)
30780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
30790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
30810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
30830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
30840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
30860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
30870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = (pos >> 5) + 1; */
30890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
30900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
30920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
30930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
30950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
30960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
30970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
30980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
30990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
31010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
31020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
31040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
31050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
31070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
31080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
31090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
31100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
31120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
31130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
31140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
31150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
31170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
31180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
31200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
31210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
31220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
31230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
31240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
31250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
31260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
31270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
31280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
31300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
31310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
31320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
31330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
31340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
31350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
31360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
31370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
31400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
31410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
31420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
31430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
31440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
31450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
31460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
31470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
31490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
31500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
31510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
31520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
31530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
31540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
31550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
31560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
31570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
31590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
31600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
31610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
31620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
31630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
31640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
31650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
31660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
31670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
31690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
31700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
31710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
31720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
31730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row */
31750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
31760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
31770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
31780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
31790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
31820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
31830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
31840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
31850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
31860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
31870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
31880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
31890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
31910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
31920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
31930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
31940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
31950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
31960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
31970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
31980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
31990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
32010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
32020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
32030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
32040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
32050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
32060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
32070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
32080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
32100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
32110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
32120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
32130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
32140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
32150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
32160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
32170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
32180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
32200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
32210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
32220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
32230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
32240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
32250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
32260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
32270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
32280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
32300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
32310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
32320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
32330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
32340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loading 8-bit 8 pixels values */
32360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
32370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
32380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
32390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
32400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
32420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 8 * dst_strd;
32430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
32440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
32450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 8)
32460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
32470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b;
32500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
32510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
32530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values0, src_values1, src_values2, src_values3;
32540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i  src_values4, src_values5, src_values6, src_values7;
32550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* Intermediate reference samples for negative angle modes */
32580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
32590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
32600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
32610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* For negative angled derive the main reference samples from side */
32630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
32650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
32670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_shuffle_epi8(src_values0, temp11);
32690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_values0 = _mm_srli_si128(src_values0, 8);
32700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
32710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
32720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
32760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
32770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(1);
32800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
32830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
32840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
32860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
32880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
32900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
32920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
32930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
32950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
32960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
32970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
32980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
32990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; */
33010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
33020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
33040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
33050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
33070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
33080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
33100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
33110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
33130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
33140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
33160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
33170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
33180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
33190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
33210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
33220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
33230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
33240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
33260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));  /* col = 0-7   */
33280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));  /* col = 8-15  */
33290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));  /* col = 16-23 */
33300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));  /* col = 24-31 */
33310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));  /* col = 32-39   */
33320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));  /* col = 40-47  */
33330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));  /* col = 48-55 */
33340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));  /* col = 56-63*/
33350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
33370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
33380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
33390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
33400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
33410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
33420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
33430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
33440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
33470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
33480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
33490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
33500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
33510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
33520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
33530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
33540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
33560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
33570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
33580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
33590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
33600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
33610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
33620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
33630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
33640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
33660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_srai_epi16(src_values0,  5);
33670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srai_epi16(src_values1,  5);
33680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_srai_epi16(src_values2,  5);
33690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srai_epi16(src_values3,  5);
33700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_srai_epi16(src_values4,  5);
33710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_srai_epi16(src_values5,  5);
33720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_srai_epi16(src_values6,  5);
33730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_srai_epi16(src_values7,  5);
33740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* converting 16 bit to 8 bit */
33760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_packus_epi16(src_values0, src_values1);
33770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_packus_epi16(src_values2, src_values3);
33780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srli_si128(src_values0, 8);
33790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srli_si128(src_values2, 8);
33800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_packus_epi16(src_values4, src_values5);
33810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_packus_epi16(src_values6, src_values7);
33820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_srli_si128(src_values4, 8);
33830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_srli_si128(src_values6, 8);
33840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loading 8-bit 8 pixels values */
33860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
33870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
33880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
33890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
33900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
33910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
33920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
33930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
33940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
33950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
33960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* if nt =4*/
33970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
33980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
33990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b;
34000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
34010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
34030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(k = 0; k < (nt + 1); k++)
34060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
34070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        ref_idx = (nt * intra_pred_ang) >> 5;
34080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        inv_ang_sum = 128;
34090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(k = -1; k > ref_idx; k--)
34110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
34120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            inv_ang_sum += inv_ang;
34130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
34140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
34150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi32(31);
34180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi32(32);
34190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi32(1);
34210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
34240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
34250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi32(4, 3, 2, 1);
34270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
34280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
34290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            int temp11, temp21, temp31, temp41;
34300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b,  res_temp5_4x32b;
34330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_values0, src_values1, src_values2, src_values3;
34340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
34350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
34370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
34380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
34400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
34410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
34430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
34440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
34450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
34460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
34470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
34480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
34490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; */
34510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
34520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
34540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
34550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
34570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
34580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
34600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
34610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
34630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
34640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
34660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
34670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
34680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
34690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));     /* col = 0-7   */
34710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));   /* col = 8-15  */
34720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* col = 16-23 */
34730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* col = 24-31 */
34740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
34760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
34770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
34780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
34790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
34820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
34830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
34840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
34850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
34870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
34880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
34890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
34900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
34910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
34930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_srai_epi16(src_values0,  5);
34940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srai_epi16(src_values1,  5);
34950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_srai_epi16(src_values2,  5);
34960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srai_epi16(src_values3,  5);
34970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
34980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* converting 16 bit to 8 bit */
34990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_packus_epi16(src_values0, src_values1);
35000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_packus_epi16(src_values2, src_values3);
35010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srli_si128(src_values0, 8);
35020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srli_si128(src_values2, 8);
35030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 = _mm_cvtsi128_si32(src_values0);
35050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp21 = _mm_cvtsi128_si32(src_values1);
35060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp31 = _mm_cvtsi128_si32(src_values2);
35070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp41 = _mm_cvtsi128_si32(src_values3);
35080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* loding 4-bit 8 pixels values */
35100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
35110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
35120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
35130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
35140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
35160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
35170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
35180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
35210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
35220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
35240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction interpolation filter for luma mode 27 to mode 33
35250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
35270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    Intra prediction for modes 27 to 33 (positive angle, vertical modes) from
35280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    the reference neighboring samples pointed to by 'pu1_ref' into the TU
35290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*    block pointed to by 'pu1_dst'
35300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_ref
35320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the reference neighboring samples
35330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[out] pu1_dst
35350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
35360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
35380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride (marked UNUSED in this function)
35390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
35410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
35420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
35440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
35450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
35470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
35480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
35500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
35520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
35530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
35540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
35550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
35560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_luma_mode_27_to_33_sse42(UWORD8 *pu1_ref,
35590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 src_strd,
35600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               UWORD8 *pu1_dst,
35610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 dst_strd,
35620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 nt,
35630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               WORD32 mode)
35640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
35650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row;
35660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt;
35670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 intra_pred_ang;
35680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp11, temp12, temp13, temp14;
35700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i     const_temp_8x16b;
35720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i temp1, temp2, temp3, temp4, sm3;
35730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
35740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt = 2 * nt;
35760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    intra_pred_ang = gai4_ihevc_ang_table[mode];
35770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_8x16b = _mm_set1_epi16(16);
35790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
35800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 32)
35810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
35820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
35840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
35850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
35860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        int col = 0;
35870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
35890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
35900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp8_4x32b = _mm_set1_epi16(8);
35910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
35930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
35950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
35960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
35980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
35990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 8)
36000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
36010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
36030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
36050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
36060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_values0, src_values1, src_values2, src_values3;
36070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i  src_values4, src_values5, src_values6, src_values7;
36080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
36100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
36110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
36130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
36140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
36160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
36170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
36190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
36200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
36220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
36230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
36250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
36260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
36280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
36290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
36310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
36320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
36330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
36340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
36360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
36370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
36380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
36390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
36410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
36420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < nt; col += 16)
36430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
36440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
36450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
36460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
36470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
36480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
36490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
36500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
36510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
36520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
36540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
36550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
36560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
36570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
36580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
36590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
36600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
36610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
36640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
36650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
36660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
36670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
36680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
36690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
36700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
36710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
36730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
36740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
36750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
36760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
36770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
36780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
36790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
36800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
36810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
36830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
36840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
36850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
36860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
36870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
36880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
36890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
36900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
36910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
36930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
36940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
36950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
36960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
36970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
36980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row (rows 0-3) */
36990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
37000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
37010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
37020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
37030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
37060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
37070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
37080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
37090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
37100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
37110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
37120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
37130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
37150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
37160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
37170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
37180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
37190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
37200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
37210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
37220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
37250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
37260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
37270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
37280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
37290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
37300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
37310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
37320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
37340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
37350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
37360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
37370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
37380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
37390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
37400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
37410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
37420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
37440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
37450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
37460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
37470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
37480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
37490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
37500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
37510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
37520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
37540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
37550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
37560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
37570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
37580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row (rows 4-7) */
37600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
37610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
37620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
37630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
37640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
37660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 8 * dst_strd;
37670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
37680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
37700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 16) /* for nt = 16 case */
37710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
37720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
37740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
37750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
37760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
37790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
37800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp8_4x32b = _mm_set1_epi16(8);
37810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
37830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
37850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
37860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
37880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 8)
37900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
37910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
37930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
37950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
37960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_values0, src_values1, src_values2, src_values3;
37970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i  src_values4, src_values5, src_values6, src_values7;
37980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
37990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
38000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
38010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
38030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
38040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
38060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
38070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
38090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
38100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
38120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
38130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
38150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
38160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
38180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
38190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
38210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
38220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
38230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
38240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
38260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
38270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
38280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
38290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
38310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
38320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
38340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
38350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
38360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
38370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
38380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
38390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
38400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
38410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
38420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
38440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
38450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
38460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
38470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
38480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
38490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
38500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
38510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
38540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
38550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
38560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
38570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
38580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
38590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
38600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp4);
38610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
38630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
38640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
38650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
38660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
38670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
38680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
38690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
38700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
38710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
38730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
38740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
38750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
38760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
38770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
38780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
38790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
38800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
38810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
38830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
38840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
38850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
38860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
38870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row */
38890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
38900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
38910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
38920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
38930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
38950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
38960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
38970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
38980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
38990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
39000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
39010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
39020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
39030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
39050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
39060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
39070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
39080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
39090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
39100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
39110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
39120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
39150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
39160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
39170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
39180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
39190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
39200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
39210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_maddubs_epi16(src_values7, temp14);
39220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
39240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
39250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
39260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
39270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
39280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
39290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
39300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
39310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
39320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
39340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_srai_epi16(src_values0,  5);
39350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_srai_epi16(src_values1,  5);
39360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_srai_epi16(src_values2,  5);
39370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_srai_epi16(src_values3,  5);
39380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values4 = _mm_srai_epi16(src_values4,  5);
39390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values5 = _mm_srai_epi16(src_values5,  5);
39400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values6 = _mm_srai_epi16(src_values6,  5);
39410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values7 = _mm_srai_epi16(src_values7,  5);
39420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* converting 16 bit to 8 bit */
39440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values0 = _mm_packus_epi16(src_values0, src_values4);
39450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values1 = _mm_packus_epi16(src_values1, src_values5);
39460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values2 = _mm_packus_epi16(src_values2, src_values6);
39470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_values3 = _mm_packus_epi16(src_values3, src_values7);
39480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* storing 16 8-bit pixel values per row */
39500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
39510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
39520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
39530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
39540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
39560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 8 * dst_strd;
39570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
39580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
39600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 8)
39610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
39620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b;
39640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
39650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
39660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi16(31);
39690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi16(32);
39700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
39720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
39750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
39760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
39780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        //for(row = 0; row < nt; row +=4)
39800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
39810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD16 ref_main_idx[9];
39830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp5_4x32b;
39850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b;
39860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_values0, src_values1, src_values2, src_values3;
39870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i  src_values4, src_values5, src_values6, src_values7;
39880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
39900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
39910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* ref_main_idx = two_nt_4x32b + idx, where idx = pos >> 5; */
39930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
39940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
39960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
39970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
39980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
39990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
40000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
40020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
40030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
40050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
40060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
40080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
40090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
40110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
40120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
40130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
40140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
40160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
40170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
40180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
40190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
40210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* col = 0-7   */
40230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* col = 8-15  */
40240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* col = 16-23 */
40250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* col = 24-31 */
40260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* col = 32-39   */
40270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* col = 40-47  */
40280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* col = 48-55 */
40290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* col = 56-63*/
40300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
40320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
40330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
40340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
40350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
40360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
40370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
40380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
40390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
40420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
40430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
40440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
40450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
40460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
40470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
40480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_maddubs_epi16(src_values7, temp14);
40490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
40510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
40520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
40530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
40540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
40550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
40560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
40570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
40580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
40590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
40610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_srai_epi16(src_values0,  5);
40620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srai_epi16(src_values1,  5);
40630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_srai_epi16(src_values2,  5);
40640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srai_epi16(src_values3,  5);
40650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_srai_epi16(src_values4,  5);
40660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_srai_epi16(src_values5,  5);
40670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_srai_epi16(src_values6,  5);
40680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_srai_epi16(src_values7,  5);
40690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* converting 16 bit to 8 bit */
40710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_packus_epi16(src_values0, src_values1);
40720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_packus_epi16(src_values2, src_values3);
40730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srli_si128(src_values0, 8);
40740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srli_si128(src_values2, 8);
40750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values4 = _mm_packus_epi16(src_values4, src_values5);
40760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values6 = _mm_packus_epi16(src_values6, src_values7);
40770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values5 = _mm_srli_si128(src_values4, 8);
40780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values7 = _mm_srli_si128(src_values6, 8);
40790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* storing 8 8-bit pixel values per row */
40810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
40820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
40830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
40840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
40850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
40860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
40870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
40880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
40890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
40900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
40920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* if nt =4*/
40930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
40940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i const_temp2_4x32b, const_temp3_4x32b;
40960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
40970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
40980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i row_4x32b, two_nt_4x32b, src_values12;
40990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp2_4x32b = _mm_set1_epi32(31);
41020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp3_4x32b = _mm_set1_epi32(32);
41030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        two_nt_4x32b = _mm_set1_epi32(two_nt + 1);
41050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
41080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
41090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        row_4x32b = _mm_set_epi32(4, 3, 2, 1);
41110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
41120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            int temp11, temp21, temp31, temp41;
41130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
41150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
41170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i src_values0, src_values1, src_values2, src_values3;
41180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
41190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* pos = ((row + 1) * intra_pred_ang); */
41210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
41220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* idx = pos >> 5; ref_main_idx = (two_nt + 1) + idx */
41240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
41250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
41270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
41280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
41290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
41300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
41310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
41320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
41330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* fract = pos & (31); */
41350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
41360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(32 - fract) */
41380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
41390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
41410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_slli_epi16(src_values10, 8);
41420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
41440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
41450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
41470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
41480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
41500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
41510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
41520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
41530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));     /* col = 0-7   */
41550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));   /* col = 8-15  */
41560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* col = 16-23 */
41570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* col = 24-31 */
41580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
41600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
41610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
41620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
41630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
41650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
41660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
41670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
41680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
41700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
41710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
41720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
41730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
41740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
41760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_srai_epi16(src_values0,  5);
41770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srai_epi16(src_values1,  5);
41780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_srai_epi16(src_values2,  5);
41790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srai_epi16(src_values3,  5);
41800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* converting 16 bit to 8 bit */
41820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values0 = _mm_packus_epi16(src_values0, src_values1);
41830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values2 = _mm_packus_epi16(src_values2, src_values3);
41840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values1 = _mm_srli_si128(src_values0, 8);
41850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            src_values3 = _mm_srli_si128(src_values2, 8);
41860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp11 = _mm_cvtsi128_si32(src_values0);
41880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp21 = _mm_cvtsi128_si32(src_values1);
41890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp31 = _mm_cvtsi128_si32(src_values2);
41900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            temp41 = _mm_cvtsi128_si32(src_values3);
41910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /* storing 4 pixels (8-bit each) per row */
41930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
41940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
41950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
41960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
41970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
41980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
41990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
42000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
42010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4202