10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_chroma_intra_pred_filters_x86_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Contains function Definition for intra prediction  interpolation filters
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Ittiam
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_intra_pred_chroma_planar_sse42()
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevc_intra_pred_chroma_dc_sse42()
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* File Includes                                                             */
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_typedefs.h"
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_macros.h"
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_func_selector.h"
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_platform_macros.h"
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_intra_pred.h"
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_chroma_intra_pred.h"
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_common_tables.h"
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include "ihevc_tables_x86_intr.h"
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <mmintrin.h>
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <xmmintrin.h>
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <emmintrin.h>
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <smmintrin.h>
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar#include <immintrin.h>
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Constant Macros                                                          */
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/****************************************************************************/
/* NOTE(review): none of the size constants below is referenced in the       */
/* visible part of this file; the meanings given are inferred from the       */
/* names and values and should be confirmed against their users.            */
#define MAX_CU_SIZE 64  /* presumably the largest coding unit dimension      */
#define BIT_DEPTH 8     /* presumably the sample bit depth                   */
#define T32_4NT 128     /* equals 4 * 32                                     */
#define T16_4NT 64      /* equals 4 * 16                                     */
#define T16C_4NT 64     /* equals 4 * 16                                     */
#define T8C_4NT 32      /* equals 4 * 8                                      */
/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/

/* Evaluates to 1 when bit 'x' of 'y' is set and to 0 otherwise.             */
/* Both arguments and the whole expansion are parenthesized so that the      */
/* macro can be used with compound arguments and inside larger expressions;  */
/* each argument is evaluated exactly once. 'x' must be in the range that    */
/* is valid for shifting an int constant (0 .. 30).                          */
#define GET_BIT(y,x) ((((y) & (1 << (x))) != 0))
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* tables to shuffle 8-bit values */
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* Function Definition                                                      */
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/*****************************************************************************/
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Planar Intraprediction with reference neighboring samples location
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* to section 8.4.4.2.4 in the standard
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_src
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the source
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_dst
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 src_strd,
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          UWORD8 *pu1_dst,
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 dst_strd,
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 nt,
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                          WORD32 mode)
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row, col;
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 log2nt = 5;
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 two_nt, three_nt;
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(mode);
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    switch(nt)
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 16:
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 4;
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 8:
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 3;
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 4:
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 2;
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        default:
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    two_nt = 2 * nt;
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    three_nt = 3 * nt;
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Planar filtering */
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/* setting vallues in  registera*/
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  pu1_ref[2*(two_nt - 1 - row)]
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  pu1_ref[2 * (three_nt + 1)]
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  pu1_ref[2 * (two_nt + 1) + col]
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar//  pu1_ref[2 * (nt - 1)]
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp6_4x32b = _mm_set1_epi16(nt);
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    const_temp7_4x32b = _mm_set1_epi16(4);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_8x16b = _mm_set1_epi32(0);
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt % 4 == 0)
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        const_temp7_4x32b = _mm_set1_epi16(4);
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row++)
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            __m128i res_temp3_8x16b;
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            row_8x16b = _mm_set1_epi16((nt - 1 - row));
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(row + 1) * pu1_ref[nt - 1]*/
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(col = 0; col < 2 * nt; col += 8)
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                __m128i src_temp_8x16b;
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* loding 8bit 16 pixles*/
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                src_temp_8x16b =  _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*(col + 1) * pu1_ref[three_nt + 1]*/
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            } /* inner loop ends here */
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Intraprediction for DC mode with reference neighboring  samples location
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* to section 8.4.4.2.5 in the standard
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par Description:
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] pu1_dst
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  UWORD8 pointer to the destination
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] src_strd
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer source stride
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] dst_strd
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer destination stride
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] nt
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer Transform Block size (Chroma)
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @param[in] mode
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  integer intraprediction mode
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @returns
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 src_strd,
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      UWORD8 *pu1_dst,
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 dst_strd,
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 nt,
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                      WORD32 mode)
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 acc_dc_u, acc_dc_v;
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 dc_val_u, dc_val_v;
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 row;
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 log2nt = 5;
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_zero = _mm_set1_epi32(0);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(src_strd);
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    UNUSED(mode);
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    switch(nt)
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 32:
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 5;
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 16:
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 4;
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 8:
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 3;
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        case 4:
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            log2nt = 2;
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        default:
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            break;
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_u = 0;
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_v = 0;
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Calculate DC value for the transform block */
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 16)
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp_sad;
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp9 =  _mm_cvtepu8_epi16(src_temp7);
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp10 =  _mm_cvtepu8_epi16(src_temp8);
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_srli_si128(src_temp3, 8);
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_srli_si128(src_temp4, 8);
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7 = _mm_srli_si128(src_temp7, 8);
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 = _mm_srli_si128(src_temp8, 8);
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp7 =  _mm_cvtepu8_epi16(src_temp7);
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 =  _mm_cvtepu8_epi16(src_temp8);
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 8)
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp_sad;
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 = _mm_srli_si128(src_temp3, 8);
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_srli_si128(src_temp4, 8);
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 4)
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i temp_sad;
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_srli_si128(src_temp3, 8);
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_u += pu1_ref[6 * nt];
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_v += pu1_ref[6 * nt + 1];
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_u -= pu1_ref[4 * nt];
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    acc_dc_v -= pu1_ref[4 * nt + 1];
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_val_u = dc_val_u | (dc_val_v << 8);
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /* Fill the remaining rows with DC value*/
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(nt == 4)
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1 = _mm_set1_epi16(dc_val_u);
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else if(nt == 8)
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1 = _mm_set1_epi16(dc_val_u);
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else /* nt == 16 */
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        src_temp1 = _mm_set1_epi16(dc_val_u);
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(row = 0; row < nt; row += 8)
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 8 * dst_strd;
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
487