10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/******************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar******************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  ihevcd_it_rec_dc_x86_intr.c
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Platform specific intrinsic implementation of certain functions
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @author
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  Ittiam
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @par List of Functions:
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevcd_itrans_recon_dc
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  - ihevcd_fmt_conv_420sp_to_420p
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar* @remarks
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*  None
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*******************************************************************************
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar*/
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <string.h>
#include <immintrin.h>
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_5;
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_6;
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_7;
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_8;
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_9;
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_15;
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20, zero_8x16b;
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i *pi4_dst = (__m128i *)pu1_dst;
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //WORD32 row,col;
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 add, shift;
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 dc_value, quant_out;
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 trans_size;
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    trans_size = (1 << log2_trans_size);
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    quant_out = i2_coeff_value;
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = IT_SHIFT_STAGE_1;
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add = 1 << (shift - 1);
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = IT_SHIFT_STAGE_2;
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add = 1 << (shift - 1);
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*Replicate the DC value within 16 bits in 128 bit register*/
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_20 = _mm_set1_epi16(dc_value);
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_8x16b = _mm_setzero_si128();
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(trans_size == 4)
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 *pi4_dst = (WORD32 *)pu1_dst;
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (WORD32 *)(pu1_dst);
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (WORD32 *)(pu1_dst);
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (WORD32 *)(pu1_dst);
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 i, j;
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(i = 1; i <= trans_size; i += 4)
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(j = 1; j <= trans_size; j += 8)
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst);
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += 8;
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 8;
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += 4 * pred_strd - trans_size;
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4 * dst_strd - trans_size;
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarvoid ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar{
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_0;
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_1;
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_2;
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_3;
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_4;
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_5;
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_6;
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_7;
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_8;
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_9;
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_10;
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_11;
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_12;
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_13;
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_14;
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_15;
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i m_temp_reg_20, zero_8x16b;
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    __m128i *pi4_dst = (__m128i *)pu1_dst;
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    //WORD32 row,col;
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 add, shift;
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 dc_value, quant_out;
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 trans_size;
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 shuffle_mask_4x4 = 0x06040200;
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    WORD32 unchanged_mask_4x4 = 0x07050301;
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    trans_size = (1 << log2_trans_size);
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    quant_out = i2_coeff_value;
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = IT_SHIFT_STAGE_1;
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add = 1 << (shift - 1);
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    shift = IT_SHIFT_STAGE_2;
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add = 1 << (shift - 1);
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    /*Replicate the DC value within 16 bits in 128 bit register*/
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    m_temp_reg_20 = _mm_set1_epi16(dc_value);
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    zero_8x16b = _mm_setzero_si128();
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    if(trans_size == 4)
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i chroma_shuffle_mask_16x8b;
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i chroma_unchanged_mask_16x8b;
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*Load the prediction data*/
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_10  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_11  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_12  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_13  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        /*Store the result in the destination*/
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64(pi4_dst, m_temp_reg_9);
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (__m128i *)(pu1_dst);
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64(pi4_dst, m_temp_reg_10);
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (__m128i *)(pu1_dst);
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64(pi4_dst, m_temp_reg_11);
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pu1_dst += dst_strd;
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        pi4_dst = (__m128i *)(pu1_dst);
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        _mm_storel_epi64(pi4_dst, m_temp_reg_12);
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    else
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    {
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        WORD32 i, j;
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i chroma_shuffle_mask_16x8b;
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        __m128i chroma_unchanged_mask_16x8b;
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        chroma_unchanged_mask_16x8b =
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                        _mm_loadl_epi64((__m128i *)(&unchanged_mask));
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        for(i = 0; i < trans_size; i += 4)
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        {
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            for(j = 0; j < trans_size; j += 8)
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            {
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*Retain only one chroma component*/
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                /*Store the result in the destination*/
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst);
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 8);
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_8);
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_9);
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_10);
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                _mm_storel_epi64(pi4_dst, m_temp_reg_11);
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_pred += 16;
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar                pu1_dst += 16;
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            }
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_pred += 4 * pred_strd - 2 * trans_size;
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar            pu1_dst += 4 * dst_strd - 2 * trans_size;
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar        }
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    }
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar}
402