common/x86/ihevc_chroma_intra_pred_filters_sse42_intr.c

/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_chroma_intra_pred_filters_x86_intr.c
*
* @brief
*  Contains function Definition for intra prediction  interpolation filters
*
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ihevc_intra_pred_chroma_planar_sse42()
*
*  ihevc_intra_pred_chroma_dc_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_platform_macros.h"
#include "ihevc_intra_pred.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_common_tables.h"
#include "ihevc_tables_x86_intr.h"

#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>


/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64
#define T16C_4NT 64
#define T8C_4NT 32
/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/

#define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x)

/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* Function Definition                                                      */
/*****************************************************************************/


/**
*******************************************************************************
*
* @brief
*  Planar Intraprediction with reference neighboring samples location
* pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
* to section 8.4.4.2.4 in the standard
*
* @par Description:
*
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 log2nt = 5;
    WORD32 two_nt, three_nt;

    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
    UNUSED(src_strd);
    UNUSED(mode);

    switch(nt)
    {
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }
    two_nt = 2 * nt;
    three_nt = 3 * nt;

    /* Planar filtering */

/* setting vallues in  registera*/

//  pu1_ref[2*(two_nt - 1 - row)]
//  pu1_ref[2 * (three_nt + 1)]
//  pu1_ref[2 * (two_nt + 1) + col]
//  pu1_ref[2 * (nt - 1)]

    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);

    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);

    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
    const_temp6_4x32b = _mm_set1_epi16(nt);
    const_temp7_4x32b = _mm_set1_epi16(4);

    zero_8x16b = _mm_set1_epi32(0);

    if(nt % 4 == 0)
    {
        const_temp7_4x32b = _mm_set1_epi16(4);

        for(row = 0; row < nt; row++)
        {
            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
            __m128i res_temp3_8x16b;

            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);

            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
            row_8x16b = _mm_set1_epi16((nt - 1 - row));

            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);

            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);

            /*(row + 1) * pu1_ref[nt - 1]*/
            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);

            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);

            for(col = 0; col < 2 * nt; col += 8)
            {
                __m128i src_temp_8x16b;

                /* loding 8bit 16 pixles*/
                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));

                src_temp_8x16b =  _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/

                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);

                /*(col + 1) * pu1_ref[three_nt + 1]*/
                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);

                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);

                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);

                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);

                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
            } /* inner loop ends here */
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intraprediction for DC mode with reference neighboring  samples location
* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
* to section 8.4.4.2.5 in the standard
*
* @par Description:
*
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size (Chroma)
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 acc_dc_u, acc_dc_v;
    WORD32 dc_val_u, dc_val_v;
    WORD32 row;
    WORD32 log2nt = 5;
    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    __m128i m_zero = _mm_set1_epi32(0);
    UNUSED(src_strd);
    UNUSED(mode);

    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }

    acc_dc_u = 0;
    acc_dc_v = 0;

    /* Calculate DC value for the transform block */

    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    if(nt == 16)
    {
        __m128i temp_sad;

        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
        src_temp9 =  _mm_cvtepu8_epi16(src_temp7);
        src_temp10 =  _mm_cvtepu8_epi16(src_temp8);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);
        src_temp7 = _mm_srli_si128(src_temp7, 8);
        src_temp8 = _mm_srli_si128(src_temp8, 8);

        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
        src_temp7 =  _mm_cvtepu8_epi16(src_temp7);
        src_temp8 =  _mm_cvtepu8_epi16(src_temp8);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 8)
    {
        __m128i temp_sad;
        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);

        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 4)
    {
        __m128i temp_sad;
        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));

        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
        src_temp4 = _mm_srli_si128(src_temp3, 8);
        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);

        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    }


    acc_dc_u += pu1_ref[6 * nt];
    acc_dc_v += pu1_ref[6 * nt + 1];

    acc_dc_u -= pu1_ref[4 * nt];
    acc_dc_v -= pu1_ref[4 * nt + 1];

    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);

    dc_val_u = dc_val_u | (dc_val_v << 8);

    /* Fill the remaining rows with DC value*/

    if(nt == 4)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

    }
    else if(nt == 8)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }

    else /* nt == 16 */
    {

        src_temp1 = _mm_set1_epi16(dc_val_u);

        for(row = 0; row < nt; row += 8)
        {
            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);

            pu1_dst += 8 * dst_strd;
        }


    }

}