/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264. The functions compute the residue, apply the forward transform and
 *  then quantize the result.
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4_sse42()
 *  - ih264_resi_trans_quant_chroma_4x4_sse42()
 *  - ih264_hadamard_quant_4x4_sse42()
 *  - ih264_hadamard_quant_2x2_uv_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* System include files */
#include <stddef.h>
#include <immintrin.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_macros.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward transform and quantization on a 4*4 block
 *
 * @par Description:
 *   The function accepts a source buffer and a prediction buffer and computes
 *   the residue from them. The residue is then transformed and quantized. The
 *   transform and quantization are computed in place, using the residue
 *   buffer.
 *
 * @param[in] pu1_src
 *   Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *   Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *   Pointer to residual sub-block
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *   QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *   Quantization Round factor
 *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *   Address at which the unquantized DC coefficient of the transformed block
 *   is stored
 *
 * @returns
 *   None
 *
 * @remarks
 *   None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
                                      WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                                      const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
                                      UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
                                      WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i sum0, sum1, sum2, cmp0, cmp1;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;

    UNUSED (pu2_threshold_matrix);

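    /*
     * Illustrative scalar sketch (not part of the interface): per coefficient,
     * the SIMD code below is equivalent to
     *
     *     res        = src - pred;
     *     w          = fwd_tx_4x4(res);   // the two butterfly passes below
     *     level      = (abs(w) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
     *     pi2_out[i] = (w < 0) ? -level : level;
     *
     * fwd_tx_4x4 is only a placeholder name for the transpose + add/sub/shift
     * passes that follow; it is not a function defined in this file.
     */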
    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    src_r0 = _mm_cvtepu8_epi16(src_r0);
    src_r1 = _mm_cvtepu8_epi16(src_r1);
    src_r2 = _mm_cvtepu8_epi16(src_r2);
    src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

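    /*-------------------------------------------------------------*/
    /* DCT [ Vertical transformation ]                             */
    /*-------------------------------------------------------------*/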
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
    *pi2_alt_dc_addr = tmp_dc;

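    /*
     * Quantization: capture the sign of each coefficient (sign_reg = +1/-1),
     * take absolute values, multiply by the scale matrix, add the rounding
     * factor, shift right by u4_qbits and finally restore the signs.
     */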
    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact);
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 =  _mm_packs_epi32 (temp0,temp1);
    temp2 =  _mm_packs_epi32 (temp2,temp3);

    temp0 =  _mm_sign_epi16(temp0, sign_reg0);
    temp2 =  _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

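    /*
     * Count the zero coefficients: a movemask of 0xffff means all eight 16-bit
     * lanes of the compare are zero; otherwise the horizontal adds accumulate
     * one per zero lane. nnz is 16 minus the zero count.
     */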
    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz =  u4_nonzero_coeff;
}

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward transform and quantization on a 4*4 chroma
 *   block
 *
 * @par Description:
 *   The function accepts a source buffer and a prediction buffer and computes
 *   the residue from them. The residue is then transformed and quantized. The
 *   transform and quantization are computed in place, using the residue
 *   buffer.
 *
 * @param[in] pu1_src
 *   Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *   Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *   Pointer to residual sub-block
 *
 * @param[in] src_strd
 *   Source stride
 *
 * @param[in] pred_strd
 *   Prediction stride
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *   QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *   Quantization Round factor
 *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *   Address at which the unquantized DC coefficient of the transformed block
 *   is stored
 *
 * @returns
 *   None
 *
 * @remarks
 *   None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src,UWORD8 *pu1_pred,WORD16 *pi2_out,
                                            WORD32 src_strd,WORD32 pred_strd,
                                            const UWORD16 *pu2_scale_matrix,
                                            const UWORD16 *pu2_threshold_matrix,
                                            UWORD32 u4_qbits,UWORD32 u4_round_factor,
                                            UWORD8  *pu1_nnz, WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128();          // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i chroma_mask = _mm_set1_epi16 (0xFF);

    UNUSED (pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

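    /*
     * The chroma source and prediction hold interleaved Cb/Cr samples; masking
     * each 16-bit lane with 0x00FF keeps every alternate byte, so only one
     * chroma component is transformed by this call.
     */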
    src_r0 = _mm_and_si128(src_r0, chroma_mask);
    src_r1 = _mm_and_si128(src_r1, chroma_mask);
    src_r2 = _mm_and_si128(src_r2, chroma_mask);
    src_r3 = _mm_and_si128(src_r3, chroma_mask);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

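    /*-------------------------------------------------------------*/
    /* DCT [ Vertical transformation ]                             */
    /*-------------------------------------------------------------*/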
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);                 //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);                 //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);                   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);                   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);             //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);             //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);             //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);             //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1);                          //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1);                          //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    tmp_dc = _mm_extract_epi16(src_r0,0);                       //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);                //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);                //c0 c1 c2 c3 d0 d1 d2 d3
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact);
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 =  _mm_packs_epi32 (temp0,temp1);
    temp2 =  _mm_packs_epi32 (temp2,temp3);

    temp0 =  _mm_sign_epi16(temp0, sign_reg0);
    temp2 =  _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz =  u4_nonzero_coeff;

}

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   4*4 block of DC coefficients
 *
 * @par Description:
 *   The function accepts a block of DC coefficients, applies the 4x4 forward
 *   Hadamard transform to it and quantizes the result.
 *
 * @param[in] pi2_src
 *   Pointer to the 4x4 source DC coefficient block
 *
 * @param[out] pi2_dst
 *   Pointer to the destination block of quantized coefficients
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *   QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *   Quantization Round factor
 *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients in the current sub-block
 *
 * @returns
 *   None
 *
 * @remarks
 *   None
 *
 */

void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                          const UWORD16 *pu2_scale_matrix,
                          const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                          UWORD32 u4_round_factor,UWORD8  *pu1_nnz
                          )
{
    WORD32 u4_zero_coeff,u4_nonzero_coeff=0;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    WORD32 mask0, mask1;
    __m128i src_r0_r1, src_r2_r3, sign_reg;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i temp0, temp1, temp2, temp3;
    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);

    UNUSED (pu2_threshold_matrix);

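    /*
     * The Hadamard butterfly below is the same add/sub network as the 4x4 core
     * transform but without the <<1 terms; the result is halved before
     * quantization, and every position is quantized with pu2_scale_matrix[0].
     */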
    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);   //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);   //b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);   //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);   //d0 d1 d2 d3

    /* Perform forward transform */
    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Horizontal transformation ]          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Vertical transformation ]            */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

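    /* Halve the Hadamard output before quantization (the forward DC transform
       is applied with a scaling of 1/2) */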
    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0);        //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0);                         //Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0);             //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //Right shift by qbits; values are non-negative, so a logical shift works
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
    temp2 =  _mm_packs_epi32 (temp2,temp3);

    temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration
    temp2 =  _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] =  u4_nonzero_coeff;
}

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   2*2 block of DC coefficients for both the U and V planes
 *
 * @par Description:
 *   The function accepts the 2x2 chroma DC coefficients of both planes,
 *   applies the 2x2 forward Hadamard transform to each and quantizes the
 *   result.
 *
 * @param[in] pi2_src
 *   Pointer to the source DC coefficients (one 2x2 block per chroma plane,
 *   stored contiguously)
 *
 * @param[out] pi2_dst
 *   Pointer to the destination block of quantized coefficients
 *
 * @param[in] pu2_scale_matrix
 *   Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *   Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *   QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *   Quantization Round factor
 *
 * @param[out] pu1_nnz
 *   Non-zero coefficient counts for the two DC blocks
 *
 * @returns
 *   None
 *
 * @remarks
 *   NNZ for the two DC blocks is written to pu1_nnz[0] and pu1_nnz[1]
 *
 */

void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                            const UWORD16 *pu2_scale_matrix,
                            const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                            UWORD32 u4_round_factor,UWORD8  *pu1_nnz)
{
    WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0;
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED (pu2_threshold_matrix);

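    /*
     * The two 2x2 DC blocks are processed together; pi2_src supplies them as
     * a0 a1 a2 a3 b0 b1 b2 b3. The 2x2 Hadamard is evaluated with horizontal
     * add/sub pairs and every value is quantized with pu2_scale_matrix[0].
     * Despite their names, nonzero_coeff_0/1 accumulate counts of zero
     * coefficients; the per-block nnz written to pu1_nnz[0] and pu1_nnz[1] is
     * 4 minus that count.
     */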
    src = _mm_loadu_si128((__m128i *)pi2_src);          //a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg);        //a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg);        //b0 b1 b2 b3 -- 32 bits

    temp0 = _mm_hadd_epi32(plane_0, plane_1);           //a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1);           //a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1);             //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1);             //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 = _mm_unpacklo_epi32(plane_0, plane_1);       //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 = _mm_unpackhi_epi32(plane_0, plane_1);       //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1);         //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1);         //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);         //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);         //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0);       //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1);      //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);               //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);            //Sign = -1 or 1 depending on <0 or >0 respectively

    plane_0 = _mm_abs_epi32(plane_0);                           //Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0);                //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1);                //multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0,rnd_fact);                  //Add round factor
    temp1 = _mm_add_epi32(temp1,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);                 //Right shift by qbits; values are non-negative, so a logical shift works
    temp1 = _mm_srli_epi32(temp1,u4_qbits);

    temp0 =  _mm_packs_epi32 (temp0,temp1);                 //Final values are 16-bits only.
    temp0 =  _mm_sign_epi16(temp0, sign_reg0);              //Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask>>8;
    if(mask0)
    {
        if(mask0 == 0xff)
            nonzero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            nonzero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            nonzero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

    pu1_nnz[0] = 4 - nonzero_coeff_0;
    pu1_nnz[1] = 4 - nonzero_coeff_1;

}