180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/******************************************************************************
280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *
380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Copyright (C) 2015 The Android Open Source Project
480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *
580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Licensed under the Apache License, Version 2.0 (the "License");
680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * you may not use this file except in compliance with the License.
780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * You may obtain a copy of the License at:
880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *
980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * http://www.apache.org/licenses/LICENSE-2.0
1080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *
1180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Unless required by applicable law or agreed to in writing, software
1280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * distributed under the License is distributed on an "AS IS" BASIS,
1380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * See the License for the specific language governing permissions and
1580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * limitations under the License.
1680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *
1780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar *****************************************************************************
1880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
1980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/
2080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/**
2180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*******************************************************************************
2280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @file
2380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  icv_variance_sse42.c
2480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
2580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @brief
2680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  This file contains the functions to compute variance
2780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
2880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @author
2980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Ittiam
3080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
3180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @par List of Functions:
3280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  icv_variance_8x4_ssse3()
3380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
3480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @remarks
3580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  None
3680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
3780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*******************************************************************************
3880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/
3980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/*****************************************************************************/
4080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* File Includes                                                             */
4180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/*****************************************************************************/
4280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* System include files */
4380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdio.h>
4480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdint.h>
4580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <string.h>
4680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdlib.h>
4780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <assert.h>
4880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <immintrin.h>
4980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
5080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* User include files */
5180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_datatypes.h"
5280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_macros.h"
5380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_platform_macros.h"
5480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv.h"
5580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
5680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/**
5780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*******************************************************************************
5880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
5980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @brief
6080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Computes variance of a given 8x4 block
6180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
6280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @par   Description
6380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Compute variance of a given 8x4 block
6480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
6580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] pu1_src
6680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Source
6780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
6880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] src_strd
6980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Source stride
7080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
7180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] wd
7280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Assumed to be 8
7380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
7480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] ht
7580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Assumed to be 4
7680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
7780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @returns
7880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*  Variance
7980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
8080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @remarks
8180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*
8280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*******************************************************************************
8380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/
8480a14110036632549a272c812f92b791fb08e87aHarish MahendrakarWORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
8580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar{
8680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    WORD32 sum;
8780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    WORD32 sum_sqr;
8880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    WORD32 blk_sz;
8980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    WORD32 vrnc;
9080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128  src_r0, src_r1;
9180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
9280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128i sum_r0, sum_r1;
9380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
9480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128i vsum, vsum_sqr;
9580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    __m128i zero;
9680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    UNUSED(wd);
9780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    UNUSED(ht);
9880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
9980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ASSERT(wd == 8);
10080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ASSERT(ht == 4);
10180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
10280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum     = 0;
10380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum_sqr = 0;
10480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
10580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    blk_sz = 8 * 4;
10680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
10780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    zero = _mm_setzero_si128();
10880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
10980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Load source */
11080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
11180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    pu1_src += src_strd;
11280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
11380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
11480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    pu1_src += src_strd;
11580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
11680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
11780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    pu1_src += src_strd;
11880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
11980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
12080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    pu1_src += src_strd;
12180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
12280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Compute sum of all elements */
12380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Use SAD with 0, since there is no pairwise addition */
12480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum_r0  = _mm_sad_epu8((__m128i)src_r0, zero);
12580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum_r1  = _mm_sad_epu8((__m128i)src_r1, zero);
12680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
12780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Accumulate SAD */
12880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum    = _mm_add_epi64(sum_r0, sum_r1);
12980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum    = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));
13080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
13180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum = _mm_cvtsi128_si32(vsum);
13280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
13380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Unpack to 16 bits */
13480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
13580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
13680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
13780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);
13880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
13980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Compute sum of squares */
14080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sqr_r0 = _mm_madd_epi16(ssrc_r0,  ssrc_r0);
14180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sqr_r1 = _mm_madd_epi16(ssrc_r1,  ssrc_r1);
14280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sqr_r2 = _mm_madd_epi16(ssrc_r2,  ssrc_r2);
14380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sqr_r3 = _mm_madd_epi16(ssrc_r3,  ssrc_r3);
14480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
14580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum_sqr = _mm_add_epi32(sqr_r0,   sqr_r1);
14680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
14780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);
14880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
14980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
15080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
15180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    sum_sqr  = _mm_cvtsi128_si32(vsum_sqr);
15280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
15380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    /* Compute variance */
15480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);
15580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
15680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar    return vrnc;
15780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar}
15880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar
159