1/****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19*/ 20/** 21******************************************************************************* 22* @file 23* icv_variance_sse42.c 24* 25* @brief 26* This file contains the functions to compute variance 27* 28* @author 29* Ittiam 30* 31* @par List of Functions: 32* icv_variance_8x4_ssse3() 33* 34* @remarks 35* None 36* 37******************************************************************************* 38*/ 39/*****************************************************************************/ 40/* File Includes */ 41/*****************************************************************************/ 42/* System include files */ 43#include <stdio.h> 44#include <stdint.h> 45#include <string.h> 46#include <stdlib.h> 47#include <assert.h> 48#include <immintrin.h> 49 50/* User include files */ 51#include "icv_datatypes.h" 52#include "icv_macros.h" 53#include "icv_platform_macros.h" 54#include "icv.h" 55 56/** 57******************************************************************************* 58* 59* @brief 60* Computes variance of a given 8x4 block 61* 62* @par Description 63* Compute variance of a given 8x4 block 64* 65* @param[in] pu1_src 66* Source 67* 68* @param[in] src_strd 69* Source stride 70* 71* @param[in] wd 72* Assumed to be 8 73* 74* @param[in] ht 75* Assumed to be 4 76* 77* @returns 78* Variance 79* 80* @remarks 81* 82******************************************************************************* 83*/ 84WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht) 85{ 86 WORD32 sum; 87 WORD32 sum_sqr; 88 WORD32 blk_sz; 89 WORD32 vrnc; 90 __m128 src_r0, src_r1; 91 __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3; 92 __m128i sum_r0, sum_r1; 93 __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3; 94 __m128i vsum, vsum_sqr; 95 __m128i zero; 96 UNUSED(wd); 97 UNUSED(ht); 98 99 ASSERT(wd == 8); 100 ASSERT(ht == 4); 101 102 sum = 0; 103 sum_sqr = 0; 104 105 blk_sz = 8 * 4; 106 107 zero = _mm_setzero_si128(); 108 109 /* Load source */ 110 src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 111 pu1_src += src_strd; 112 113 src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src)); 114 pu1_src += src_strd; 115 116 src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src)); 117 pu1_src += src_strd; 118 119 src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src)); 120 pu1_src += src_strd; 121 122 /* Compute sum of all elements */ 123 /* Use SAD with 0, since there is no pairwise addition */ 124 sum_r0 = _mm_sad_epu8((__m128i)src_r0, zero); 125 sum_r1 = _mm_sad_epu8((__m128i)src_r1, zero); 126 127 /* Accumulate SAD */ 128 vsum = _mm_add_epi64(sum_r0, sum_r1); 129 vsum = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8)); 130 131 sum = _mm_cvtsi128_si32(vsum); 132 133 /* Unpack to 16 bits */ 134 ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero); 135 ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero); 136 ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero); 137 ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero); 138 139 /* Compute sum of squares */ 140 sqr_r0 = _mm_madd_epi16(ssrc_r0, ssrc_r0); 141 sqr_r1 = _mm_madd_epi16(ssrc_r1, ssrc_r1); 142 sqr_r2 = _mm_madd_epi16(ssrc_r2, ssrc_r2); 143 sqr_r3 = _mm_madd_epi16(ssrc_r3, ssrc_r3); 144 145 vsum_sqr = _mm_add_epi32(sqr_r0, sqr_r1); 146 vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2); 147 vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3); 148 149 vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8)); 150 vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4)); 151 sum_sqr = _mm_cvtsi128_si32(vsum_sqr); 152 153 /* Compute variance */ 154 vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz); 155 156 return vrnc; 157} 158 159