/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  icv_variance_sse42.c
*
* @brief
*  This file contains the functions to compute variance
*
* @author
*  Ittiam
*
* @par List of Functions:
*  icv_variance_8x4_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <immintrin.h>

/* User include files */
#include "icv_datatypes.h"
#include "icv_macros.h"
#include "icv_platform_macros.h"
#include "icv.h"

/**
*******************************************************************************
*
* @brief
*  Computes variance of a given 8x4 block
*
* @par   Description
*  Computes the variance of a given 8x4 block of 8-bit samples
*
* @param[in] pu1_src
*  Pointer to the source buffer
*
* @param[in] src_strd
*  Source stride
*
* @param[in] wd
*  Width, assumed to be 8
*
* @param[in] ht
*  Height, assumed to be 4
*
* @returns
*  Variance
*
* @remarks
*  None
*
*******************************************************************************
*/
WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
{
    WORD32 sum;
    WORD32 sum_sqr;
    WORD32 blk_sz;
    WORD32 vrnc;
    __m128  src_r0, src_r1;
    __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
    __m128i sum_r0, sum_r1;
    __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
    __m128i vsum, vsum_sqr;
    __m128i zero;
    UNUSED(wd);
    UNUSED(ht);

    ASSERT(wd == 8);
    ASSERT(ht == 4);

    sum     = 0;
    sum_sqr = 0;

    blk_sz = 8 * 4;

    zero = _mm_setzero_si128();

    /* Load source */
    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
    pu1_src += src_strd;
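
    /* src_r0 now holds rows 0 and 2 of the block (8 bytes each in its low   */
    /* and high halves); src_r1 holds rows 1 and 3                           */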

    /* Compute sum of all elements */
    /* Use SAD with 0, since there is no pairwise addition */
    sum_r0  = _mm_sad_epu8((__m128i)src_r0, zero);
    sum_r1  = _mm_sad_epu8((__m128i)src_r1, zero);

    /* Accumulate SAD */
    vsum    = _mm_add_epi64(sum_r0, sum_r1);
    vsum    = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));

    sum = _mm_cvtsi128_si32(vsum);

    /* Unpack to 16 bits */
    ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
    ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
    ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
    ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);

    /* Compute sum of squares */
    sqr_r0 = _mm_madd_epi16(ssrc_r0,  ssrc_r0);
    sqr_r1 = _mm_madd_epi16(ssrc_r1,  ssrc_r1);
    sqr_r2 = _mm_madd_epi16(ssrc_r2,  ssrc_r2);
    sqr_r3 = _mm_madd_epi16(ssrc_r3,  ssrc_r3);
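
    /* _mm_madd_epi16 squares each 16-bit sample and adds adjacent products, */
    /* so each 32-bit lane of sqr_r0..sqr_r3 holds the sum of squares of two */
    /* neighbouring samples                                                   */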

    vsum_sqr = _mm_add_epi32(sqr_r0,   sqr_r1);
    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
    vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);

    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
    vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
    sum_sqr  = _mm_cvtsi128_si32(vsum_sqr);

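    /* Variance over the N = 32 samples of the block:                        */
    /*   var = E[x^2] - (E[x])^2 = (N * sum_sqr - sum * sum) / (N * N)       */
    /* evaluated below entirely in integer arithmetic                        */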
    /* Compute variance */
    vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);

    return vrnc;
}
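
/*
 * Illustrative only, not part of the original source: a plain C sketch of
 * what icv_variance_8x4_ssse3() computes, assuming an 8x4 block and the same
 * integer variance formula (N * sum_sqr - sum * sum) / (N * N) with N = 32.
 * The helper name icv_variance_8x4_ref is hypothetical.
 *
 *     WORD32 icv_variance_8x4_ref(UWORD8 *pu1_src, WORD32 src_strd)
 *     {
 *         WORD32 i, j;
 *         WORD32 sum = 0;
 *         WORD32 sum_sqr = 0;
 *
 *         for(i = 0; i < 4; i++)
 *         {
 *             for(j = 0; j < 8; j++)
 *             {
 *                 sum     += pu1_src[j];
 *                 sum_sqr += pu1_src[j] * pu1_src[j];
 *             }
 *             pu1_src += src_strd;
 *         }
 *
 *         return ((sum_sqr * 32) - (sum * sum)) / (32 * 32);
 *     }
 */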