180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/****************************************************************************** 280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * 380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Copyright (C) 2015 The Android Open Source Project 480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * 580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Licensed under the Apache License, Version 2.0 (the "License"); 680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * you may not use this file except in compliance with the License. 780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * You may obtain a copy of the License at: 880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * 980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * http://www.apache.org/licenses/LICENSE-2.0 1080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * 1180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Unless required by applicable law or agreed to in writing, software 1280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * distributed under the License is distributed on an "AS IS" BASIS, 1380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * See the License for the specific language governing permissions and 1580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * limitations under the License. 1680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * 1780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ***************************************************************************** 1880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 1980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/ 2080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/** 2180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar******************************************************************************* 2280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @file 2380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* icv_variance_sse42.c 2480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 2580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @brief 2680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* This file contains the functions to compute variance 2780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 2880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @author 2980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Ittiam 3080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 3180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @par List of Functions: 3280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* icv_variance_8x4_ssse3() 3380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 3480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @remarks 3580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* None 3680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 3780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar******************************************************************************* 3880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/ 3980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/*****************************************************************************/ 4080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* File Includes */ 4180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/*****************************************************************************/ 4280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* System include files */ 4380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdio.h> 4480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdint.h> 4580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <string.h> 4680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <stdlib.h> 4780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <assert.h> 4880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include <immintrin.h> 4980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 5080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/* User include files */ 5180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_datatypes.h" 5280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_macros.h" 5380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv_platform_macros.h" 5480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar#include "icv.h" 5580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 5680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar/** 5780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar******************************************************************************* 5880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 5980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @brief 6080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Computes variance of a given 8x4 block 6180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 6280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @par Description 6380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Compute variance of a given 8x4 block 6480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 6580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] pu1_src 6680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Source 6780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 6880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] src_strd 6980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Source stride 7080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 7180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] wd 7280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Assumed to be 8 7380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 7480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @param[in] ht 7580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Assumed to be 4 7680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 7780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @returns 7880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* Variance 7980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 8080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* @remarks 8180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar* 8280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar******************************************************************************* 8380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar*/ 8480a14110036632549a272c812f92b791fb08e87aHarish MahendrakarWORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht) 8580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar{ 8680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar WORD32 sum; 8780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar WORD32 sum_sqr; 8880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar WORD32 blk_sz; 8980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar WORD32 vrnc; 9080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128 src_r0, src_r1; 9180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3; 9280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128i sum_r0, sum_r1; 9380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3; 9480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128i vsum, vsum_sqr; 9580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar __m128i zero; 9680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar UNUSED(wd); 9780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar UNUSED(ht); 9880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 9980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ASSERT(wd == 8); 10080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ASSERT(ht == 4); 10180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 10280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum = 0; 10380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum_sqr = 0; 10480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 10580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar blk_sz = 8 * 4; 10680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 10780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar zero = _mm_setzero_si128(); 10880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 10980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Load source */ 11080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 11180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar pu1_src += src_strd; 11280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 11380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 11480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar pu1_src += src_strd; 11580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 11680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); 11780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar pu1_src += src_strd; 11880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 11980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); 12080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar pu1_src += src_strd; 12180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 12280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Compute sum of all elements */ 12380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Use SAD with 0, since there is no pairwise addition */ 12480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero); 12580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero); 12680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 12780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Accumulate SAD */ 12880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum = _mm_add_epi64(sum_r0, sum_r1); 12980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8)); 13080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 13180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum = _mm_cvtsi128_si32(vsum); 13280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 13380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Unpack to 16 bits */ 13480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero); 13580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero); 13680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero); 13780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero); 13880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 13980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Compute sum of squares */ 14080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0); 14180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1); 14280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2); 14380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3); 14480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 14580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1); 14680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2); 14780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3); 14880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 14980a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8)); 15080a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4)); 15180a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar sum_sqr = _mm_cvtsi128_si32(vsum_sqr); 15280a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 15380a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar /* Compute variance */ 15480a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz); 15580a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 15680a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar return vrnc; 15780a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar} 15880a14110036632549a272c812f92b791fb08e87aHarish Mahendrakar 159