1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_itrans_recon_x86_intr.c
22 *
23 * @brief
24 *  Contains function definitions for inverse  quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 *  100470
29 *  100592 (edited by)
30 *
31 * @par List of Functions:
32 *  - ihevc_itrans_recon_4x4_ttype1_sse42()
33 *  - ihevc_itrans_recon_4x4_sse42()
34 *  - ihevc_itrans_recon_8x8_sse42()
35 *
36 * @remarks
37 *  None
38 *
39 *******************************************************************************
40 */
41#include <stdio.h>
42#include <string.h>
43#include "ihevc_typedefs.h"
44#include "ihevc_macros.h"
45#include "ihevc_platform_macros.h"
46#include "ihevc_defs.h"
47#include "ihevc_trans_tables.h"
48#include "ihevc_iquant_itrans_recon.h"
49#include "ihevc_func_selector.h"
50#include "ihevc_trans_macros.h"
51
52#include <immintrin.h>
53#include <emmintrin.h>
54#include <smmintrin.h>
55#include <tmmintrin.h>
56
57/**
58 *******************************************************************************
59 *
60 * @brief
61 *  This function performs inverse quantization, inverse  transform
62 * type1(DST) and reconstruction for 4x4  input block
63 *
64 * @par Description:
65 *  Performs inverse quantization , inverse transform type 1  and adds
66 * prediction data and clips output to 8 bit
67 *
68 * @param[in] pi2_src
69 *  Input 4x4 coefficients
70 *
71 * @param[in] pi2_tmp
72 *  Temporary 4x4 buffer for storing inverse
73 *  transform 1st stage output
74 *
75 * @param[in] pu1_pred
76 *  Prediction 4x4 block
77 *
78 * @param[in] pi2_dequant_coeff
79 *  Dequant Coeffs
80 *
81 * @param[out] pu1_dst
82 *  Output 4x4 block
83 *
84 * @param[in] qp_div
85 *  Quantization parameter / 6
86 *
87 * @param[in] qp_rem
88 *  Quantization parameter % 6
89 *
90 * @param[in] src_strd
91 *  Input stride
92 *
93 * @param[in] pred_strd
94 *  Prediction stride
95 *
96 * @param[in] dst_strd
97 *  Output Stride
98 *
99 * @param[in] zero_cols
100 *  Zero columns in pi2_src
101 *
102 * @returns  Void
103 *
104 * @remarks
105 *  None
106 *
107 *******************************************************************************
108 */
109
110
111void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src,
112                                         WORD16 *pi2_tmp,
113                                         UWORD8 *pu1_pred,
114                                         UWORD8 *pu1_dst,
115                                         WORD32 src_strd,
116                                         WORD32 pred_strd,
117                                         WORD32 dst_strd,
118                                         WORD32 zero_cols,
119                                         WORD32 zero_rows)
120{
121    __m128i m_temp_reg_0;
122    __m128i m_temp_reg_1;
123    __m128i m_temp_reg_2;
124    __m128i m_temp_reg_3;
125    __m128i m_temp_reg_4;
126    __m128i m_temp_reg_10;
127    __m128i m_temp_reg_11;
128    __m128i m_temp_reg_12;
129    __m128i m_temp_reg_13;
130    __m128i m_temp_reg_14;
131    __m128i m_temp_reg_20;
132    __m128i m_temp_reg_21;
133    __m128i m_temp_reg_22;
134    __m128i m_temp_reg_23;
135    __m128i m_temp_reg_24;
136    __m128i m_temp_reg_25;
137    __m128i m_temp_reg_30;
138    __m128i m_temp_reg_31;
139    __m128i m_temp_reg_32;
140    __m128i m_temp_reg_33;
141    __m128i m_temp_reg_34;
142    __m128i m_temp_reg_35;
143    __m128i m_temp_reg_36;
144    __m128i m_coeff1, m_coeff2, m_coeff3;
145    __m128i m_rdng_factor;
146    __m128i m_count;
147
148    WORD32 i4_shift = IT_SHIFT_STAGE_1;
149    UNUSED(zero_rows);
150    UNUSED(zero_cols);
151    UNUSED(pi2_tmp);
152
153    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74
154
155    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
156    pi2_src += src_strd;
157    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
158    pi2_src += src_strd;
159    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
160    pi2_src += src_strd;
161    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
162
163    m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
164    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
165
166    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
167    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
168
169    /* c[4] in m_temp_reg_14 */
170    /* c[4] = src[0] - src[2] + src[3] */
171    {
172        m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2);
173    }
174
175    /* c[3] in m_temp_reg_13 */
176    {
177        m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3);
178    }
179
180    /* c[0] in m_temp_reg_10 */
181    {
182        m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2);
183    }
184
185    /* c[1] in m_temp_reg_11 */
186    {
187        m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3);
188    }
189
190    /* c[2] in m_temp_reg_12 */
191    {
192        m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3);
193    }
194
195    /* c[4] in m_temp_reg_14 */
196    /* c[4] = src[0] - src[2] + src[3] */
197    {
198        m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3);
199    }
200
201    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29
202    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55
203
204    /* Stage 1 outputs stored in m_temp_reg_20-23 */
205    {
206        m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0
207        m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1
208
209        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
210
211        m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1
212        m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2
213
214        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
215
216        m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0
217        m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2
218        m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4
219
220        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
221        m_count = _mm_cvtsi32_si128(i4_shift);
222
223        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
224        m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13);
225        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
226
227        m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
228        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
229
230        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
231        m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13);
232        m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
233
234        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
235
236        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
237        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
238        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
239        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
240
241        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
242        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
243        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
244        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
245
246        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
247        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
248
249        m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
250        m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
251
252    }
253
254    /* Stage 2 */
255    {
256        i4_shift = IT_SHIFT_STAGE_2;
257
258        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
259        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
260        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
261        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
262        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
263        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
264
265        /* c[4] stored in m_temp_reg_4 */
266        {
267            m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
268        }
269
270        /* c[3] stored in m_temp_reg_3 */
271        {
272            m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3);
273        }
274
275        /* c[0] stored in m_temp_reg_0 */
276        {
277            m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
278        }
279
280        /* c[1] stored in m_temp_reg_1 */
281        {
282            m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21);
283        }
284
285        /* c[2] stored in m_temp_reg_2 */
286        {
287            m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23);
288        }
289
290        /* c[4] stored in m_temp_reg_4 */
291        {
292            m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23);
293        }
294
295        /* Stage 2 output generation */
296        {
297            m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0
298            m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1
299
300            m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
301
302            m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1
303            m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2
304
305            m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
306
307            m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0
308            m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2
309            m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4
310
311            m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
312            m_count = _mm_cvtsi32_si128(i4_shift);
313
314            m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3);
315            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
316            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4);
317
318            m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
319            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4);
320
321            m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3);
322            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35);
323            m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4);
324
325            m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor);
326
327            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
328            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
329            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
330            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
331
332            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
333            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
334            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
335            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
336
337            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
338            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
339
340            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
341            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
342        }
343
344        /* Recon and store */
345        {
346            WORD32 *pi4_dst = (WORD32 *)pu1_dst;
347
348            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
349            pu1_pred += pred_strd;
350            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
351            pu1_pred += pred_strd;
352            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
353            pu1_pred += pred_strd;
354            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
355
356            m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
357            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
358            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
359            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
360            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
361            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
362
363            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
364            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
365
366            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
367
368            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
369            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
370            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
371            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
372            pu1_dst += dst_strd;
373            pi4_dst = (WORD32 *)(pu1_dst);
374
375            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
376            pu1_dst += dst_strd;
377            pi4_dst = (WORD32 *)(pu1_dst);
378
379            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
380            pu1_dst += dst_strd;
381            pi4_dst = (WORD32 *)(pu1_dst);
382
383            *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
384        }
385    }
386}
387
388/**
389 *******************************************************************************
390 *
391 * @brief
392 *  This function performs inverse quantization, inverse  transform
393 * (DCT) and reconstruction for 4x4  input block
394 *
395 * @par Description:
396 *  Performs inverse quantization , inverse transform and adds
397 * prediction data and clips output to 8 bit
398 *
399 * @param[in] pi2_src
400 *  Input 4x4 coefficients
401 *
402 * @param[in] pi2_tmp
403 *  Temporary 4x4 buffer for storing inverse
404 *  transform 1st stage output
405 *
406 * @param[in] pu1_pred
407 *  Prediction 4x4 block
408 *
409 * @param[in] pi2_dequant_coeff
410 *  Dequant Coeffs
411 *
412 * @param[out] pu1_dst
413 *  Output 4x4 block
414 *
415 * @param[in] qp_div
416 *  Quantization parameter / 6
417 *
418 * @param[in] qp_rem
419 *  Quantization parameter % 6
420 *
421 * @param[in] src_strd
422 *  Input stride
423 *
424 * @param[in] pred_strd
425 *  Prediction stride
426 *
427 * @param[in] dst_strd
428 *  Output Stride
429 *
430 * @param[in] zero_cols
431 *  Zero columns in pi2_src
432 *
433 * @returns  Void
434 *
435 * @remarks
436 *  None
437 *
438 *******************************************************************************
439 */
440
441void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src,
442                                  WORD16 *pi2_tmp,
443                                  UWORD8 *pu1_pred,
444                                  UWORD8 *pu1_dst,
445                                  WORD32 src_strd,
446                                  WORD32 pred_strd,
447                                  WORD32 dst_strd,
448                                  WORD32 zero_cols,
449                                  WORD32 zero_rows)
450{
451
452
453    __m128i m_temp_reg_0;
454    __m128i m_temp_reg_1;
455    __m128i m_temp_reg_2;
456    __m128i m_temp_reg_3;
457    __m128i m_temp_reg_10;
458    __m128i m_temp_reg_11;
459    __m128i m_temp_reg_12;
460    __m128i m_temp_reg_13;
461    __m128i m_temp_reg_14;
462    __m128i m_temp_reg_15;
463    __m128i m_temp_reg_20;
464    __m128i m_temp_reg_21;
465    __m128i m_temp_reg_22;
466    __m128i m_temp_reg_23;
467    __m128i m_temp_reg_24;
468    __m128i m_temp_reg_25;
469    __m128i m_temp_reg_30;
470    __m128i m_temp_reg_31;
471    __m128i m_temp_reg_33;
472    __m128i m_temp_reg_34;
473    __m128i m_coeff1, m_coeff3;
474    __m128i m_rdng_factor;
475    __m128i m_count;
476
477
478    WORD32 i4_shift = IT_SHIFT_STAGE_1;
479    UNUSED(zero_rows);
480    UNUSED(zero_cols);
481    UNUSED(pi2_tmp);
482
483
484    m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src);
485    pi2_src += src_strd;
486    m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src);
487    pi2_src += src_strd;
488    m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src);
489    pi2_src += src_strd;
490    m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src);
491
492    m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0);
493    m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2);
494
495    m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1);
496    m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3);
497
498
499    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36
500    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83
501
502    /* e */
503    {
504        m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6);
505        m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6);
506    }
507
508    /* o */
509    {
510        m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36
511        m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83
512        m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83
513        m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36
514    }
515
516    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
517
518    /* e1 stored in m_temp_reg_31 */
519    {
520        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
521    }
522
523    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
524
525    /* e0 stored in m_temp_reg_30 */
526    {
527        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
528    }
529
530    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
531    m_count = _mm_cvtsi32_si128(i4_shift);
532
533    /* o1 stored in m_temp_reg_33 */
534    {
535        m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
536    }
537
538    /* e1 + add */
539    {
540        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
541    }
542
543    /* e0 + add */
544    {
545        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
546    }
547
548    /* o0 stored in m_temp_reg_34 */
549    {
550        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
551    }
552
553    /* Stage 1 outputs */
554    {
555        m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
556        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
557
558        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
559        m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
560
561
562        m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
563        m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
564        m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
565        m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
566
567        m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
568        m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
569        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
570        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
571
572        m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
573        m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
574
575        m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
576        m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
577    }
578
579    /* Stage 2 */
580    {
581        i4_shift = IT_SHIFT_STAGE_2;
582
583        m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
584        m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
585
586        m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20);
587        m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21);
588
589        m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22);
590        m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23);
591
592        /* e */
593        {
594            m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6);
595        }
596
597        /* o */
598        {
599            m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36
600            m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83
601            m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83
602            m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36
603        }
604
605        /* e */
606        {
607            m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6);
608        }
609
610        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
611
612        /* e1 stored in m_temp_reg_31 */
613        {
614            m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11);
615        }
616
617        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
618
619        /* e0 stored in m_temp_reg_30 */
620        {
621            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11);
622        }
623
624        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
625        m_count = _mm_cvtsi32_si128(i4_shift);
626
627        /* o1 stored in m_temp_reg_33 */
628        {
629            m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13);
630        }
631
632        /* e1 + add */
633        {
634            m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
635        }
636
637        /* e0 + add */
638        {
639            m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
640        }
641
642        /* o0 stored in m_temp_reg_34 */
643        {
644            m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15);
645        }
646
647        /* Stage 2 outputs */
648        {
649            m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33);
650            m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33);
651            m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34);
652            m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34);
653
654            m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count);
655            m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count);
656            m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count);
657            m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count);
658
659            m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
660            m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
661            m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8);
662            m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8);
663
664            m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22);
665            m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23);
666
667            m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25);
668            m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25);
669        }
670
671        /* Recon and store */
672        {
673            UWORD32 *pu4_dst = (UWORD32 *)pu1_dst;
674
675            m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
676            pu1_pred += pred_strd;
677            m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
678            pu1_pred += pred_strd;
679            m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
680            pu1_pred += pred_strd;
681            m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
682
683            m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
684            m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1);
685            m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1);
686            m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2);
687            m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3);
688            m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3);
689
690            m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0);
691            m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1);
692
693            m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21);
694
695            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0);
696            m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4);
697            m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8);
698            m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12);
699            pu1_dst += dst_strd;
700            pu4_dst = (UWORD32 *)(pu1_dst);
701
702            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
703            pu1_dst += dst_strd;
704            pu4_dst = (UWORD32 *)(pu1_dst);
705
706            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
707            pu1_dst += dst_strd;
708            pu4_dst = (UWORD32 *)(pu1_dst);
709
710            *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
711        }
712    }
713}
714
715
716
717/**
718 *******************************************************************************
719 *
720 * @brief
721 *  This function performs inverse quantization, inverse  transform and
 * reconstruction for 8x8 input block
723 *
724 * @par Description:
725 *  Performs inverse quantization , inverse transform  and adds the
726 * prediction data and clips output to 8 bit
727 *
728 * @param[in] pi2_src
729 *  Input 8x8 coefficients
730 *
731 * @param[in] pi2_tmp
732 *  Temporary 8x8 buffer for storing inverse
733 *  transform 1st stage output
734 *
735 * @param[in] pu1_pred
736 *  Prediction 8x8 block
737 *
738 * @param[in] pi2_dequant_coeff
739 *  Dequant Coeffs
740 *
741 * @param[out] pu1_dst
742 *  Output 8x8 block
743 *
744 * @param[in] src_strd
745 *  Input stride
746 *
747 * @param[in] qp_div
748 *  Quantization parameter / 6
749 *
750 * @param[in] qp_rem
751 *  Quantization parameter % 6
752 *
753 * @param[in] pred_strd
754 *  Prediction stride
755 *
756 * @param[in] dst_strd
757 *  Output Stride
758 *
759 * @param[in] zero_cols
760 *  Zero columns in pi2_src
761 *
762 * @returns  Void
763 *
764 * @remarks
765 *  None
766 *
767 *******************************************************************************
768 */
769
770
771void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src,
772                                  WORD16 *pi2_tmp,
773                                  UWORD8 *pu1_pred,
774                                  UWORD8 *pu1_dst,
775                                  WORD32 src_strd,
776                                  WORD32 pred_strd,
777                                  WORD32 dst_strd,
778                                  WORD32 zero_cols,
779                                  WORD32 zero_rows)
780{
781    __m128i m_temp_reg_0;
782    __m128i m_temp_reg_1;
783    __m128i m_temp_reg_2;
784    __m128i m_temp_reg_3;
785    __m128i m_temp_reg_5;
786    __m128i m_temp_reg_6;
787    __m128i m_temp_reg_7;
788    __m128i m_temp_reg_4;
789    __m128i m_temp_reg_10;
790    __m128i m_temp_reg_11;
791    __m128i m_temp_reg_12;
792    __m128i m_temp_reg_13;
793    __m128i m_temp_reg_14;
794    __m128i m_temp_reg_15;
795    __m128i m_temp_reg_16;
796    __m128i m_temp_reg_17;
797    __m128i m_temp_reg_20;
798    __m128i m_temp_reg_21;
799    __m128i m_temp_reg_22;
800    __m128i m_temp_reg_23;
801    __m128i m_temp_reg_24;
802    __m128i m_temp_reg_25;
803    __m128i m_temp_reg_26;
804    __m128i m_temp_reg_27;
805    __m128i m_temp_reg_30;
806    __m128i m_temp_reg_31;
807    __m128i m_temp_reg_32;
808    __m128i m_temp_reg_33;
809    __m128i m_temp_reg_34;
810    __m128i m_temp_reg_35;
811    __m128i m_temp_reg_36;
812    __m128i m_temp_reg_37;
813    __m128i m_temp_reg_40;
814    __m128i m_temp_reg_41;
815    __m128i m_temp_reg_42;
816    __m128i m_temp_reg_43;
817    __m128i m_temp_reg_44;
818    __m128i m_temp_reg_45;
819    __m128i m_temp_reg_46;
820    __m128i m_temp_reg_47;
821    __m128i m_temp_reg_50;
822    __m128i m_temp_reg_51;
823    __m128i m_temp_reg_52;
824    __m128i m_temp_reg_53;
825    __m128i m_temp_reg_54;
826    __m128i m_temp_reg_55;
827    __m128i m_temp_reg_56;
828    __m128i m_temp_reg_57;
829    __m128i m_temp_reg_60;
830    __m128i m_temp_reg_61;
831    __m128i m_temp_reg_62;
832    __m128i m_temp_reg_63;
833    __m128i m_temp_reg_64;
834    __m128i m_temp_reg_65;
835    __m128i m_temp_reg_66;
836    __m128i m_temp_reg_67;
837    __m128i m_temp_reg_70;
838    __m128i m_temp_reg_71;
839    __m128i m_temp_reg_72;
840    __m128i m_temp_reg_73;
841    __m128i m_temp_reg_74;
842    __m128i m_temp_reg_75;
843    __m128i m_temp_reg_76;
844    __m128i m_temp_reg_77;
845    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
846
847    WORD32 check_row_stage_1;   /* Lokesh */
848    WORD32 check_row_stage_2;   /* Lokesh */
849
850    __m128i m_rdng_factor;
851    WORD32 i4_shift = IT_SHIFT_STAGE_1;
852    UNUSED(pi2_tmp);
853    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
854    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
855
856    m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
857    pi2_src += src_strd;
858    m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
859    pi2_src += src_strd;
860    m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
861    pi2_src += src_strd;
862    m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
863    pi2_src += src_strd;
864
865    m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
866    pi2_src += src_strd;
867    m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
868    pi2_src += src_strd;
869    m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
870    pi2_src += src_strd;
871    m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
872
873    if(!check_row_stage_2)
874    {
875        if(!check_row_stage_1)
876        {
877            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
878            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
879            {
880                //Interleaving 0,4 row in 0 , 1 Rishab
881                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
882                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
883                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
884
885                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
886
887                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
888                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
889
890            }
891
892
893            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
894            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
895            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
896            {
897
898                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
899                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
900
901                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
902                //Interleaving 2,6 row in 4, 5 Rishab
903                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
904
905                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
906                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
907
908
909                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
910
911                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
912                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
913
914                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
915                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
916
917
918
919                /* e */
920
921                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
922                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
923                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
924                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
925                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
926                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
927
928                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
929                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
930
931            }
932
933            /* o */
934            {
935
936                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
937                {
938
939                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
                    //o0:1B*89+3B*75 (5B/7B terms are not computed in this branch)
941                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
942
943                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
944                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
945
946
947
948                    /* Column 0 of destination computed here */
949                    /* It is stored in m_temp_reg_50 */
950                    /* Column 7 of destination computed here */
951                    /* It is stored in m_temp_reg_57 */
952                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
953
954
955
956                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
957                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
958
959                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
960                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
961
962                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
963                    m_temp_reg_63 = _mm_setzero_si128();
964                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
965
                    //o1:1B*75-3B*18 (5B/7B terms are not computed in this branch)
967                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
968
969                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
970                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
971
972                    /* Loading coeff for computing o2  in the next block */
973
974                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
975                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
976
977                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
978
979
980
981                    /* Column 1 of destination computed here */
982                    /* It is stored in m_temp_reg_51 */
983                    /* Column 6 of destination computed here */
984                    /* It is stored in m_temp_reg_56 */
985
986                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
987                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
988
989                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
990                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
991
992                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
993                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
994
                    //o2:1B*50-3B*89 (5B/7B terms are not computed in this branch)
996                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
997
998                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
999                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1000
1001
1002                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1003
1004                    /* Loading coeff for computing o3  in the next block */
1005
1006                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1007                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1008
1009
1010
1011                    /* Column 2 of destination computed here */
1012                    /* It is stored in m_temp_reg_52 */
1013                    /* Column 5 of destination computed here */
1014                    /* It is stored in m_temp_reg_55 */
1015
1016                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1017                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1018
1019                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1020                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1021
1022                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1023                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1024
                    //o3:1B*18-3B*50 (5B/7B terms are not computed in this branch)
1026                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1027
1028                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1029                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1030
1031
1032
1033                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1034
1035
1036
1037                    /* Column 3 of destination computed here */
1038                    /* It is stored in m_temp_reg_53 */
1039                    /* Column 4 of destination computed here */
1040                    /* It is stored in m_temp_reg_54 */
1041
1042                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1043                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1044
1045                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1046                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1047
1048                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1049                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1050
1051
1052                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1053                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1054                }
1055            }
1056
1057            /* Transpose of the destination 8x8 matrix done here */
1058            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1059            /* respectively */
1060            {
1061                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1062                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1063                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1064                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1065
1066                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1067                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1068
1069                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1070                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1071
1072                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1073                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1074                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1075                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1076
1077                m_temp_reg_54 = _mm_setzero_si128();
1078                m_temp_reg_55 = _mm_setzero_si128();
1079                m_temp_reg_56 = _mm_setzero_si128();
1080                m_temp_reg_57 = _mm_setzero_si128();
1081            }
1082        }
1083        else
1084        {
1085            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1086            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1087            {
1088                //Interleaving 0,4 row in 0 , 1 Rishab
1089                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1090                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1091                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1092
1093                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1094
1095                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1096                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1097
1098            }
1099
1100
1101            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1102            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1103            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
1104            {
1105
1106                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1107                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1108
1109                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1110                //Interleaving 2,6 row in 4, 5 Rishab
1111                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1112
1113                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1114                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1115
1116
1117                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1118
1119                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1120                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1121
1122                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1123                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1124
1125
1126
1127                /* e */
1128
1129                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1130                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1131                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1132                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1133                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1134                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1135
1136                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1137                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1138
1139            }
1140
1141            /* o */
1142            {
1143
1144                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1145                {
1146
1147                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1148                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1149                    //o0:1B*89+3B*75,5B*50+7B*18
1150                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1151                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1152
1153                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1154                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1155
1156                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1157
1158
1159
1160                    /* Column 0 of destination computed here */
1161                    /* It is stored in m_temp_reg_50 */
1162                    /* Column 7 of destination computed here */
1163                    /* It is stored in m_temp_reg_57 */
1164                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
1165
1166
1167
1168                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1169                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1170
1171                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1172                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1173
1174                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1175                    m_temp_reg_63 = _mm_setzero_si128();
1176                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1177
1178                    //o1:1B*75-3B*18,5B*89+7B*50
1179                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1180                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1181
1182                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1183                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1184
1185                    /* Loading coeff for computing o2  in the next block */
1186
1187                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1188                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
1189
1190                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1191                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1192
1193
1194
1195                    /* Column 1 of destination computed here */
1196                    /* It is stored in m_temp_reg_51 */
1197                    /* Column 6 of destination computed here */
1198                    /* It is stored in m_temp_reg_56 */
1199
1200                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1201                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1202
1203                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1204                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1205
1206                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1207                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1208
1209                    //o2:1B*50-3B*89,5B*18+7B*75
1210                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1211                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1212
1213                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1214                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1215
1216
1217                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1218
1219                    /* Loading coeff for computing o3  in the next block */
1220
1221                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1222                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
1223
1224                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1225
1226
1227                    /* Column 2 of destination computed here */
1228                    /* It is stored in m_temp_reg_52 */
1229                    /* Column 5 of destination computed here */
1230                    /* It is stored in m_temp_reg_55 */
1231
1232                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1233                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1234
1235                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1236                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1237
1238                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1239                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1240
1241                    //o3:1B*18-3B*50,5B*75-7B*89
1242                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1243                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1244
1245                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1246                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1247
1248
1249
1250                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1251
1252                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1253
1254
1255                    /* Column 3 of destination computed here */
1256                    /* It is stored in m_temp_reg_53 */
1257                    /* Column 4 of destination computed here */
1258                    /* It is stored in m_temp_reg_54 */
1259
1260                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1261                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1262
1263                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1264                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1265
1266                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1267                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1268
1269
1270                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1271                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
1272                }
1273            }
1274
1275            /* Transpose of the destination 8x8 matrix done here */
1276            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1277            /* respectively */
1278            {
1279                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1280                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1281                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1282                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1283
1284                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1285                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1286                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1287                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1288
1289                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1290                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1291                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1292                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1293
1294                m_temp_reg_54 = _mm_setzero_si128();
1295                m_temp_reg_55 = _mm_setzero_si128();
1296                m_temp_reg_56 = _mm_setzero_si128();
1297                m_temp_reg_57 = _mm_setzero_si128();
1298            }
1299        }
1300
1301        /* Stage 2 */
1302        i4_shift = IT_SHIFT_STAGE_2;
1303        {
1304            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1305            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1306            {
1307                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
1308                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
1309
1310                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1311                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1312
1313                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1314                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1315                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1316                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1317
1318
1319                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
1320                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
1321            }
1322
1323
1324            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1325            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1326            {
1327
1328                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1329                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1330
1331
1332                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1333                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1334                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1335                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1336
1337                /* Loading coeff for computing o0 in the next block */
1338                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1339
1340
1341                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1342                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1343
1344
1345
1346                /* e */
1347
1348                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1349                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1350                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1351                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1352                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1353                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1354
1355                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1356                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1357
1358                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1359                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1360
1361                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1362                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1363
1364            }
1365
1366            /* o */
1367            {
1368
1369                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1370                {
1371                    //o0:1B*89+3B*75,1T*89+3T*75
1372                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1373                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1374
1375                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1376                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1377                    /* Loading coeff for computing o1 in the next block */
1378                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1379
1380
1381
1382                    /* Column 0 of destination computed here */
1383                    /* It is stored in m_temp_reg_50 */
1384                    /* Column 7 of destination computed here */
1385                    /* It is stored in m_temp_reg_57 */
1386
1387                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1388                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1389
1390                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1391                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1392
1393                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1394                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1395                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1396                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1397
1398                    //o1:1B*75-3B*18,1T*75-3T*18
1399                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1400                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1401
1402                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1403                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1404                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1405                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1406
1407                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1408                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1409
1410
1411                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1412
1413
1414                    /* Loading coeff for computing o2  in the next block */
1415                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1416
1417
1418
1419                    /* Column 1 of destination computed here */
1420                    /* It is stored in m_temp_reg_51 */
1421                    /* Column 6 of destination computed here */
1422                    /* It is stored in m_temp_reg_56 */
1423
1424                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1425                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1426
1427                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1428                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1429
1430                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1431                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1432                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1433                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1434
                    //o2:1B*50-3B*89,1T*50-3T*89
1436                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1437                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1438
1439                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1440                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1441                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1442                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1443
1444                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1445                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1446
1447
1448                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1449
1450                    /* Loading coeff for computing o3  in the next block */
1451
1452                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1453
1454
1455                    /* Column 2 of destination computed here */
1456                    /* It is stored in m_temp_reg_52 */
1457                    /* Column 5 of destination computed here */
1458                    /* It is stored in m_temp_reg_55 */
1459
1460                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1461                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1462
1463                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1464                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1465
1466                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1467                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1468                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1469                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1470
1471                    //o3:1B*18-3B*50,1T*18-3T*50
1472                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1473                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1474
1475                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1476                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1477                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1478                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1479
1480
1481                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1482                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1483
1484
1485
1486                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1487
1488
1489                    /* Column 3 of destination computed here */
1490                    /* It is stored in m_temp_reg_53 */
1491                    /* Column 4 of destination computed here */
1492                    /* It is stored in m_temp_reg_54 */
1493
1494                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1495                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1496
1497                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1498                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1499
1500                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1501                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1502                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1503                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1504
1505                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1506                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1507                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1508                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1509
1510                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1511                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1512                }
1513            }
1514
1515            /* Transpose of the destination 8x8 matrix done here */
1516            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1517            /* respectively */
1518            {
1519                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1520                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1521                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1522                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1523                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1524                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1525                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1526                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1527
1528                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1529                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1530                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1531                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1532                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1533                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1534                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1535                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1536                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1537                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1538                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1539                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1540
1541                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1542                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1543                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1544                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1545            }
1546
1547            /* Recon and store */
1548            {
1549                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1550                pu1_pred += pred_strd;
1551                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1552                pu1_pred += pred_strd;
1553                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1554                pu1_pred += pred_strd;
1555                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1556                pu1_pred += pred_strd;
1557                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1558                pu1_pred += pred_strd;
1559                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1560                pu1_pred += pred_strd;
1561                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1562                pu1_pred += pred_strd;
1563                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1564
1565                m_temp_reg_50 = _mm_setzero_si128();
1566                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1567                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1568                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1569                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1570                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1571                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1572                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1573                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1574
1575                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1576                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1577                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1578                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1579                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1580                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1581                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1582                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1583
1584                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1585                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1586                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1587                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1588                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1589                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1590                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1591                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1592
1593                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1594                pu1_dst += dst_strd;
1595                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1596                pu1_dst += dst_strd;
1597                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1598                pu1_dst += dst_strd;
1599                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1600                pu1_dst += dst_strd;
1601                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1602                pu1_dst += dst_strd;
1603                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1604                pu1_dst += dst_strd;
1605                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1606                pu1_dst += dst_strd;
1607                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1608                pu1_dst += dst_strd;
1609            }
1610        }
1611    }
1612    else
1613
1614    {
1615
1616        /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1617        /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1618        if(!check_row_stage_1)
1619        {
1620            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1621            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1622            {
1623                //Interleaving 0,4 row in 0 , 1 Rishab
1624                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1625                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1626                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1627
1628                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1629                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1630
1631                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1632                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1633
1634
1635                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1636                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1637            }
1638
1639
1640            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1641            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1642            {
1643
1644                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1645                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1646
1647                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1648                //Interleaving 2,6 row in 4, 5 Rishab
1649                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1650                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1651
1652                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1653                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1654
1655                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1656                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1657
1658
1659
1660                /* Loading coeff for computing o0 and o1 in the o block below */
1661
1662                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1663                //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]);
1664
1665                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1666                //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]);
1667
1668            }
1669
1670            /* e */
1671            {
1672                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1673                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1674                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1675                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1676                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1677                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1678
1679                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1680                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1681
1682                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1683                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1684
1685                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1686                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1687
1688            }
1689
1690            /* o */
1691            {
1692
1693                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1694                {
1695
1696                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1697                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1698                    //o0:1B*89+3B*75,1T*89+3T*75
1699                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1700                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1701
1702                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1703                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1704
1705                }
1706
1707                /* Column 0 of destination computed here */
1708                /* It is stored in m_temp_reg_50 */
1709                /* Column 7 of destination computed here */
1710                /* It is stored in m_temp_reg_57 */
1711                {
1712
1713
1714                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1715                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1716
1717                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1718                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1719
1720                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1721                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1722                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1723                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1724
1725                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1726                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1727                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1728                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1729
1730                    //o1:1B*75-3B*18,1T*75-3T*18
1731                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1732                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1733
1734                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1735                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1736
1737                    /* Loading coeff for computing o2  in the next block */
1738
1739                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
1740
1741                }
1742
1743                /* Column 1 of destination computed here */
1744                /* It is stored in m_temp_reg_51 */
1745                /* Column 6 of destination computed here */
1746                /* It is stored in m_temp_reg_56 */
1747                {
1748                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1749                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1750
1751                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1752                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1753
1754                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1755                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1756                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1757                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1758
1759                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1760                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1761                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1762                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1763
1764                    //o2:1B*50-3B*89,1T*50-3T*89
1765                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1766                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1767
1768                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1769                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1770
1771
1772                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1773
1774
1775                    /* Loading coeff for computing o3  in the next block */
1776
1777                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
1778
1779                }
1780
1781                /* Column 2 of destination computed here */
1782                /* It is stored in m_temp_reg_52 */
1783                /* Column 5 of destination computed here */
1784                /* It is stored in m_temp_reg_55 */
1785                {
1786                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1787                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1788
1789                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1790                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1791
1792                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1793                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1794                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1795                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1796
1797                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1798                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1799                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1800                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1801
1802                    //o3:1B*18-3B*50,1T*18-3T*50
1803                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1804                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1805
1806                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1807                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1808
1809
1810
1811                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1812
1813
1814                }
1815
1816                /* Column 3 of destination computed here */
1817                /* It is stored in m_temp_reg_53 */
1818                /* Column 4 of destination computed here */
1819                /* It is stored in m_temp_reg_54 */
1820                {
1821                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1822                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1823
1824                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1825                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1826
1827                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1828                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1829                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1830                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1831
1832                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1833                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1834                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1835                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1836
1837                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1838                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1839                }
1840            }
1841
1842            /* Transpose of the destination 8x8 matrix done here */
1843            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1844            /* respectively */
1845            {
1846
1847
1848                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1849                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1850                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1851                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1852                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1853                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1854                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1855                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1856
1857                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1858                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1859                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1860                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1861                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1862                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1863                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1864                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1865
1866                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1867                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1868                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1869                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1870
1871                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1872                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1873                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1874                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1875            }
1876        }
1877        else
1878        {
1879
1880            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1881            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1882            {
1883                //Interleaving 0,4 row in 0 , 1 Rishab
1884                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1885                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]);
1886                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]);
1887
1888                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1889                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1890
1891                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1892                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1893
1894
1895                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1896                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1897            }
1898
1899
1900            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1901            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1902            {
1903
1904                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1905                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1906
1907                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1908                //Interleaving 2,6 row in 4, 5 Rishab
1909                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1910                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1911
1912                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1913                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1914
1915                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1916                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1917
1918
1919
1920                /* Loading coeff for computing o0 and o1 in the o block below */
1921
1922                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
1923                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
1924
1925                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
1926                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
1927
1928            }
1929
1930            /* e */
1931            {
1932                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1933                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1934                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1935                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1936                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1937                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1938
1939                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1940                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1941
1942                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1943                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1944
1945                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1946                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1947
1948            }
1949
1950            /* o */
1951            {
1952
1953                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1954                {
1955
1956                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1957                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1958                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1959                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
1960                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1961                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1962                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1963                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1964                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1965
1966
1967                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1968                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1969
1970                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1971                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1972                }
1973
1974                /* Column 0 of destination computed here */
1975                /* It is stored in m_temp_reg_50 */
1976                /* Column 7 of destination computed here */
1977                /* It is stored in m_temp_reg_57 */
1978                {
1979
1980
1981                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1982                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1983
1984                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1985                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1986
1987                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1988                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1989                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1990                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1991
1992                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1993                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1994                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1995                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1996
1997                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1998                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1999                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2000                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2001                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2002
2003                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2004                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2005
2006                    /* Loading coeff for computing o2  in the next block */
2007
2008                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2009                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2010
2011                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2012                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2013                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2014                }
2015
2016                /* Column 1 of destination computed here */
2017                /* It is stored in m_temp_reg_51 */
2018                /* Column 6 of destination computed here */
2019                /* It is stored in m_temp_reg_56 */
2020                {
2021                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2022                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2023
2024                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2025                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2026
2027                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2028                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2029                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2030                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2031
2032                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2033                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2034                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2035                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2036
2037                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2038                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
2039                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
2040                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
2041                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
2042
2043                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2044                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2045
2046
2047                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2048
2049
2050                    /* Loading coeff for computing o3  in the next block */
2051
2052                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2053                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2054
2055                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2056                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2057                }
2058
2059                /* Column 2 of destination computed here */
2060                /* It is stored in m_temp_reg_52 */
2061                /* Column 5 of destination computed here */
2062                /* It is stored in m_temp_reg_55 */
2063                {
2064                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2065                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2066
2067                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2068                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2069
2070                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2071                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2072                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2073                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2074
2075                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2076                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2077                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2078                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2079
2080                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2081                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
2082                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
2083                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
2084                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
2085
2086                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2087                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2088
2089
2090
2091                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2092
2093
2094                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2095                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2096                }
2097
2098                /* Column 3 of destination computed here */
2099                /* It is stored in m_temp_reg_53 */
2100                /* Column 4 of destination computed here */
2101                /* It is stored in m_temp_reg_54 */
2102                {
2103                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2104                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2105
2106                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2107                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2108
2109                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
2110                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
2111                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
2112                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
2113
2114                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
2115                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
2116                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
2117                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
2118
2119                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
2120                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
2121                }
2122            }
2123
2124            /* Transpose of the destination 8x8 matrix done here */
2125            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
2126            /* respectively */
2127            {
2128
2129
2130                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2131                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2132                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2133                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2134                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2135                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2136                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2137                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2138
2139                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2140                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2141                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2142                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2143                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2144                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2145                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2146                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2147
2148                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2149                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2150                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2151                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2152
2153                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2154                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2155                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2156                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2157            }
2158        }
2159        /* Stage 2 */
2160
2161        i4_shift = IT_SHIFT_STAGE_2;
2162
2163        {
2164
2165            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
2166            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
2167            {
2168                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add
2169                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub
2170
2171                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
2172                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
2173
2174                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2175                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2176                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2177                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2178
2179
2180                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]);
2181                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]);
2182            }
2183
2184
2185            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
2186            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
2187            {
2188                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
2189                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
2190
2191
2192                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2193                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2194                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2195                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2196
2197                /* Loading coeff for computing o0 in the next block */
2198                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]);
2199                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]);
2200
2201
2202                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
2203                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
2204            }
2205
2206            /* e */
2207            {
2208                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
2209                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
2210                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
2211                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
2212                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
2213                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
2214
2215                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
2216                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
2217
2218                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
2219                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
2220
2221                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
2222                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
2223
2224            }
2225
2226            /* o */
2227            {
2228                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
2229                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
2230
2231                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
2232                {
2233                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
2234                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2235                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2236                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2237                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2238
2239                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2240                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
2241                    /* Loading coeff for computing o1 in the next block */
2242                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]);
2243                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]);
2244
2245                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2246                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2247                }
2248
2249                /* Column 0 of destination computed here */
2250                /* It is stored in m_temp_reg_50 */
2251                /* Column 7 of destination computed here */
2252                /* It is stored in m_temp_reg_57 */
2253                {
2254                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2255                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2256
2257                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2258                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2259
2260                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2261                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2262                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2263                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2264
2265                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2266                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2267                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2268                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2269
2270                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
2271                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2272                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2273                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2274                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2275
2276                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2277                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2278
2279
2280                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
2281
2282
2283                    /* Loading coeff for computing o2  in the next block */
2284                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]);
2285                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]);
2286
2287                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
2288                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
2289                }
2290
2291                /* Column 1 of destination computed here */
2292                /* It is stored in m_temp_reg_51 */
2293                /* Column 6 of destination computed here */
2294                /* It is stored in m_temp_reg_56 */
2295                {
2296                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
2297                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
2298
2299                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
2300                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
2301
2302                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2303                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2304                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2305                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2306
2307                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2308                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2309                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2310                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2311
2312                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
2313                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2314                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
2315                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2316                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
2317
2318                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2319                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2320
2321
2322                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
2323
2324                    /* Loading coeff for computing o3  in the next block */
2325
2326                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]);
2327                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]);
2328
2329                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
2330                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
2331                }
2332
2333                /* Column 2 of destination computed here */
2334                /* It is stored in m_temp_reg_52 */
2335                /* Column 5 of destination computed here */
2336                /* It is stored in m_temp_reg_55 */
2337                {
2338                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
2339                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
2340
2341                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
2342                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
2343
2344                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
2345                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
2346                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
2347                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
2348
2349                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
2350                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
2351                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
2352                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
2353
2354                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
2355                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2356                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
2357                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2358                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
2359
2360                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
2361                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
2362
2363
2364
2365                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
2366
2367
2368                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
2369                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
2370                }
2371
2372                /* Column 3 of destination computed here */
2373                /* It is stored in m_temp_reg_53 */
2374                /* Column 4 of destination computed here */
2375                /* It is stored in m_temp_reg_54 */
2376                {
2377                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
2378                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
2379
2380                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
2381                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
2382
2383                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
2384                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
2385                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
2386                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
2387
2388                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
2389                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
2390                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
2391                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
2392
2393                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
2394                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
2395                }
2396            }
2397
            /* Transpose of the destination 8x8 matrix done here */
            /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
            /* respectively */
2401            {
2402                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
2403                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
2404                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
2405                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
2406                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
2407                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
2408                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
2409                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
2410
2411                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
2412                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
2413                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
2414                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
2415                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
2416                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
2417                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
2418                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
2419                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
2420                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
2421                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
2422                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
2423
2424                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
2425                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
2426                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
2427                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
2428            }
2429
2430            /* Recon and store */
2431            {
2432                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
2433                pu1_pred += pred_strd;
2434                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
2435                pu1_pred += pred_strd;
2436                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
2437                pu1_pred += pred_strd;
2438                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
2439                pu1_pred += pred_strd;
2440                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
2441                pu1_pred += pred_strd;
2442                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
2443                pu1_pred += pred_strd;
2444                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
2445                pu1_pred += pred_strd;
2446                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
2447
2448
2449                m_temp_reg_50 = _mm_setzero_si128();
2450                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
2451                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
2452                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
2453                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
2454                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
2455                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
2456                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
2457                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
2458
2459                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
2460                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
2461                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
2462                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
2463                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
2464                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
2465                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
2466                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
2467
2468                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
2469                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
2470                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
2471                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
2472                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
2473                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
2474                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
2475                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
2476
2477                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
2478                pu1_dst += dst_strd;
2479                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
2480                pu1_dst += dst_strd;
2481                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
2482                pu1_dst += dst_strd;
2483                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
2484                pu1_dst += dst_strd;
2485                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
2486                pu1_dst += dst_strd;
2487                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
2488                pu1_dst += dst_strd;
2489                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
2490                pu1_dst += dst_strd;
2491                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
2492                pu1_dst += dst_strd;
2493
2494            }
2495
2496
2497        }
2498
2499
2500    }
2501}
2502