impeg2_idct_recon_sse42_intr.c revision aed24eee7ddfc93f1436b0c1679431bd286879b4
1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20
21/**
22 *******************************************************************************
23 * @file
24 *  impeg2_idct_recon_sse42_intr.c
25 *
26 * @brief
27 *  Contains function definitions for inverse  quantization, inverse
28 * transform and reconstruction
29 *
30 * @author
31 *  100470
32 *  100592 (edited by)
33 *
34 * @par List of Functions:
35 *  - impeg2_idct_recon_sse42()
36 *
37 * @remarks
38 *  None
39 *
40 *******************************************************************************
41 */
42#include <stdio.h>
43#include <string.h>
44#include "iv_datatypedef.h"
45#include "impeg2_macros.h"
46#include "impeg2_defs.h"
47#include "impeg2_globals.h"
48
49#include <immintrin.h>
50#include <emmintrin.h>
51#include <smmintrin.h>
52#include <tmmintrin.h>
53
54
55/**
56 *******************************************************************************
57 *
58 * @brief
59 *  This function performs inverse quantization, inverse transform and
60 * reconstruction for 8x8 input block
61 *
62 * @par Description:
63 *  Performs inverse quantization, inverse transform and adds the
64 * prediction data and clips output to 8 bit
65 *
66 * @param[in] pi2_src
67 *  Input 8x8 coefficients
68 *
69 * @param[in] pi2_tmp
70 *  Temporary 8x8 buffer for storing inverse
71 *  transform 1st stage output
72 *
73 * @param[in] pu1_pred
74 *  Prediction 8x8 block
75 *
76 * @param[in] pi2_dequant_coeff
77 *  Dequant Coeffs
78 *
79 * @param[out] pu1_dst
80 *  Output 8x8 block
81 *
82 * @param[in] src_strd
83 *  Input stride
84 *
85 * @param[in] qp_div
86 *  Quantization parameter / 6
87 *
88 * @param[in] qp_rem
89 *  Quantization parameter % 6
90 *
91 * @param[in] pred_strd
92 *  Prediction stride
93 *
94 * @param[in] dst_strd
95 *  Output Stride
96 *
97 * @param[in] zero_cols
98 *  Zero columns in pi2_src
99 *
100 * @returns  Void
101 *
102 * @remarks
103 *  None
104 *
105 *******************************************************************************
106 */
107
108
109void impeg2_idct_recon_sse42(WORD16 *pi2_src,
110                                  WORD16 *pi2_tmp,
111                                  UWORD8 *pu1_pred,
112                                  UWORD8 *pu1_dst,
113                                  WORD32 src_strd,
114                                  WORD32 pred_strd,
115                                  WORD32 dst_strd,
116                                  WORD32 zero_cols,
117                                  WORD32 zero_rows)
118{
119    __m128i m_temp_reg_0;
120    __m128i m_temp_reg_1;
121    __m128i m_temp_reg_2;
122    __m128i m_temp_reg_3;
123    __m128i m_temp_reg_5;
124    __m128i m_temp_reg_6;
125    __m128i m_temp_reg_7;
126    __m128i m_temp_reg_4;
127    __m128i m_temp_reg_10;
128    __m128i m_temp_reg_11;
129    __m128i m_temp_reg_12;
130    __m128i m_temp_reg_13;
131    __m128i m_temp_reg_14;
132    __m128i m_temp_reg_15;
133    __m128i m_temp_reg_16;
134    __m128i m_temp_reg_17;
135    __m128i m_temp_reg_20;
136    __m128i m_temp_reg_21;
137    __m128i m_temp_reg_22;
138    __m128i m_temp_reg_23;
139    __m128i m_temp_reg_24;
140    __m128i m_temp_reg_25;
141    __m128i m_temp_reg_26;
142    __m128i m_temp_reg_27;
143    __m128i m_temp_reg_30;
144    __m128i m_temp_reg_31;
145    __m128i m_temp_reg_32;
146    __m128i m_temp_reg_33;
147    __m128i m_temp_reg_34;
148    __m128i m_temp_reg_35;
149    __m128i m_temp_reg_36;
150    __m128i m_temp_reg_37;
151    __m128i m_temp_reg_40;
152    __m128i m_temp_reg_41;
153    __m128i m_temp_reg_42;
154    __m128i m_temp_reg_43;
155    __m128i m_temp_reg_44;
156    __m128i m_temp_reg_45;
157    __m128i m_temp_reg_46;
158    __m128i m_temp_reg_47;
159    __m128i m_temp_reg_50;
160    __m128i m_temp_reg_51;
161    __m128i m_temp_reg_52;
162    __m128i m_temp_reg_53;
163    __m128i m_temp_reg_54;
164    __m128i m_temp_reg_55;
165    __m128i m_temp_reg_56;
166    __m128i m_temp_reg_57;
167    __m128i m_temp_reg_60;
168    __m128i m_temp_reg_61;
169    __m128i m_temp_reg_62;
170    __m128i m_temp_reg_63;
171    __m128i m_temp_reg_64;
172    __m128i m_temp_reg_65;
173    __m128i m_temp_reg_66;
174    __m128i m_temp_reg_67;
175    __m128i m_temp_reg_70;
176    __m128i m_temp_reg_71;
177    __m128i m_temp_reg_72;
178    __m128i m_temp_reg_73;
179    __m128i m_temp_reg_74;
180    __m128i m_temp_reg_75;
181    __m128i m_temp_reg_76;
182    __m128i m_temp_reg_77;
183    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
184
185    WORD32 check_row_stage_1;   /* Lokesh */
186    WORD32 check_row_stage_2;   /* Lokesh */
187
188    __m128i m_rdng_factor;
189    WORD32 i4_shift = IDCT_STG1_SHIFT;
190    UNUSED(pi2_tmp);
191    check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
192    check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
193
194    m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
195    pi2_src += src_strd;
196    m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
197    pi2_src += src_strd;
198    m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
199    pi2_src += src_strd;
200    m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
201    pi2_src += src_strd;
202
203    m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
204    pi2_src += src_strd;
205    m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
206    pi2_src += src_strd;
207    m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
208    pi2_src += src_strd;
209    m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
210
211    if(!check_row_stage_2)
212    {
213        if(!check_row_stage_1)
214        {
215            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
216            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
217            {
218                //Interleaving 0,4 row in 0 , 1 Rishab
219                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
220                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
221                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
222
223                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
224
225                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
226                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
227
228            }
229
230
231            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
232            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
233            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
234            {
235
236                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
237                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
238
239                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
240                //Interleaving 2,6 row in 4, 5 Rishab
241                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
242
243                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
244                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
245
246
247                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
248
249                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
250                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
251
252                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
253                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
254
255
256
257                /* e */
258
259                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
260                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
261                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
262                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
263                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
264                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
265
266                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
267                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
268
269            }
270
271            /* o */
272            {
273
274                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
275                {
276
277                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
278                    //o0:1B*89+3B*75,5B*50+7B*18
279                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
280
281                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
282                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
283
284
285
286                    /* Column 0 of destination computed here */
287                    /* It is stored in m_temp_reg_50 */
288                    /* Column 7 of destination computed here */
289                    /* It is stored in m_temp_reg_57 */
290                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
291
292
293
294                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
295                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
296
297                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
298                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
299
300                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
301                    m_temp_reg_63 = _mm_setzero_si128();
302                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
303
304                    //o1:1B*75-3B*18,5B*89+7B*50
305                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
306
307                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
308                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
309
310                    /* Loading coeff for computing o2  in the next block */
311
312                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
313                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
314
315                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
316
317
318
319                    /* Column 1 of destination computed here */
320                    /* It is stored in m_temp_reg_51 */
321                    /* Column 6 of destination computed here */
322                    /* It is stored in m_temp_reg_56 */
323
324                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
325                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
326
327                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
328                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
329
330                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
331                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
332
333                    //o2:1B*50-3B*89,5B*18+7B*75
334                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
335
336                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
337                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
338
339
340                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
341
342                    /* Loading coeff for computing o3  in the next block */
343
344                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
345                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
346
347
348
349                    /* Column 2 of destination computed here */
350                    /* It is stored in m_temp_reg_52 */
351                    /* Column 5 of destination computed here */
352                    /* It is stored in m_temp_reg_55 */
353
354                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
355                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
356
357                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
358                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
359
360                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
361                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
362
363                    //o3:1B*18-3B*50,5B*75-7B*89
364                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
365
366                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
367                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
368
369
370
371                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
372
373
374
375                    /* Column 3 of destination computed here */
376                    /* It is stored in m_temp_reg_53 */
377                    /* Column 4 of destination computed here */
378                    /* It is stored in m_temp_reg_54 */
379
380                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
381                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
382
383                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
384                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
385
386                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
387                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
388
389
390                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
391                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
392                }
393            }
394
395            /* Transpose of the destination 8x8 matrix done here */
396            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
397            /* respectively */
398            {
399                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
400                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
401                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
402                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
403
404                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
405                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
406
407                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
408                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
409
410                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
411                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
412                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
413                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
414
415                m_temp_reg_54 = _mm_setzero_si128();
416                m_temp_reg_55 = _mm_setzero_si128();
417                m_temp_reg_56 = _mm_setzero_si128();
418                m_temp_reg_57 = _mm_setzero_si128();
419            }
420        }
421        else
422        {
423            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
424            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
425            {
426                //Interleaving 0,4 row in 0 , 1 Rishab
427                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
428                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
429                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
430
431                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
432
433                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
434                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
435
436            }
437
438
439            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
440            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
441            /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
442            {
443
444                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
445                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
446
447                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
448                //Interleaving 2,6 row in 4, 5 Rishab
449                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
450
451                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
452                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
453
454
455                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
456
457                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
458                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
459
460                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
461                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
462
463
464
465                /* e */
466
467                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
468                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
469                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
470                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
471                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
472                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
473
474                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
475                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
476
477            }
478
479            /* o */
480            {
481
482                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
483                {
484
485                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
486                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
487                    //o0:1B*89+3B*75,5B*50+7B*18
488                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
489                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
490
491                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
492                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
493
494                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
495
496
497
498                    /* Column 0 of destination computed here */
499                    /* It is stored in m_temp_reg_50 */
500                    /* Column 7 of destination computed here */
501                    /* It is stored in m_temp_reg_57 */
502                    /* Upper 8 bytes of both registers are zero due to zero_cols*/
503
504
505
506                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
507                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
508
509                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
510                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
511
512                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
513                    m_temp_reg_63 = _mm_setzero_si128();
514                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
515
516                    //o1:1B*75-3B*18,5B*89+7B*50
517                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
518                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
519
520                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
521                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
522
523                    /* Loading coeff for computing o2  in the next block */
524
525                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
526                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
527
528                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
529                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
530
531
532
533                    /* Column 1 of destination computed here */
534                    /* It is stored in m_temp_reg_51 */
535                    /* Column 6 of destination computed here */
536                    /* It is stored in m_temp_reg_56 */
537
538                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
539                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
540
541                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
542                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
543
544                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
545                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
546
547                    //o2:1B*50-3B*89,5B*18+7B*75
548                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
549                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
550
551                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
552                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
553
554
555                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
556
557                    /* Loading coeff for computing o3  in the next block */
558
559                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
560                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
561
562                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
563
564
565                    /* Column 2 of destination computed here */
566                    /* It is stored in m_temp_reg_52 */
567                    /* Column 5 of destination computed here */
568                    /* It is stored in m_temp_reg_55 */
569
570                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
571                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
572
573                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
574                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
575
576                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
577                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
578
579                    //o3:1B*18-3B*50,5B*75-7B*89
580                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
581                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
582
583                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
584                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
585
586
587
588                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
589
590                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
591
592
593                    /* Column 3 of destination computed here */
594                    /* It is stored in m_temp_reg_53 */
595                    /* Column 4 of destination computed here */
596                    /* It is stored in m_temp_reg_54 */
597
598                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
599                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
600
601                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
602                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
603
604                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
605                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
606
607
608                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
609                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
610                }
611            }
612
613            /* Transpose of the destination 8x8 matrix done here */
614            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
615            /* respectively */
616            {
617                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
618                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
619                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
620                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
621
622                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
623                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
624                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
625                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
626
627                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
628                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
629                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
630                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
631
632                m_temp_reg_54 = _mm_setzero_si128();
633                m_temp_reg_55 = _mm_setzero_si128();
634                m_temp_reg_56 = _mm_setzero_si128();
635                m_temp_reg_57 = _mm_setzero_si128();
636            }
637        }
638
639        /* Stage 2 */
640        i4_shift = IDCT_STG2_SHIFT;
641        {
642            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
643            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
644            {
645                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
646                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
647
648                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
649                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
650
651                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
652                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
653                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
654                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
655
656
657                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
658                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
659            }
660
661
662            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
663            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
664            {
665
666                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
667                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
668
669
670                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
671                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
672                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
673                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
674
675                /* Loading coeff for computing o0 in the next block */
676                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
677
678
679                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
680                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
681
682
683
684                /* e */
685
686                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
687                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
688                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
689                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
690                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
691                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
692
693                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
694                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
695
696                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
697                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
698
699                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
700                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
701
702            }
703
704            /* o */
705            {
706
707                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
708                {
709                    //o0:1B*89+3B*75,1T*89+3T*75
710                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
711                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
712
713                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
714                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
715                    /* Loading coeff for computing o1 in the next block */
716                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
717
718
719
720                    /* Column 0 of destination computed here */
721                    /* It is stored in m_temp_reg_50 */
722                    /* Column 7 of destination computed here */
723                    /* It is stored in m_temp_reg_57 */
724
725                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
726                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
727
728                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
729                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
730
731                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
732                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
733                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
734                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
735
736                    //o1:1B*75-3B*18,1T*75-3T*18
737                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
738                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
739
740                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
741                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
742                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
743                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
744
745                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
746                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
747
748
749                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
750
751
752                    /* Loading coeff for computing o2  in the next block */
753                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
754
755
756
757                    /* Column 1 of destination computed here */
758                    /* It is stored in m_temp_reg_51 */
759                    /* Column 6 of destination computed here */
760                    /* It is stored in m_temp_reg_56 */
761
762                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
763                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
764
765                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
766                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
767
768                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
769                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
770                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
771                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
772
773                    //o2:1B*50-3B*89,1T*50-3T*89
774                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
775                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
776
777                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
778                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
779                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
780                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
781
782                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
783                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
784
785
786                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
787
788                    /* Loading coeff for computing o3  in the next block */
789
790                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
791
792
793                    /* Column 2 of destination computed here */
794                    /* It is stored in m_temp_reg_52 */
795                    /* Column 5 of destination computed here */
796                    /* It is stored in m_temp_reg_55 */
797
798                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
799                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
800
801                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
802                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
803
804                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
805                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
806                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
807                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
808
809                    //o3:1B*18-3B*50,1T*18-3T*50
810                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
811                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
812
813                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
814                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
815                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
816                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
817
818
819                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
820                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
821
822
823
824                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
825
826
827                    /* Column 3 of destination computed here */
828                    /* It is stored in m_temp_reg_53 */
829                    /* Column 4 of destination computed here */
830                    /* It is stored in m_temp_reg_54 */
831
832                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
833                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
834
835                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
836                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
837
838                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
839                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
840                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
841                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
842
843                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
844                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
845                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
846                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
847
848                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
849                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
850                }
851            }
852
853            /* Transpose of the destination 8x8 matrix done here */
854            /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
855            /* respectively */
856            {
857                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
858                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
859                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
860                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
861                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
862                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
863                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
864                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
865
866                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
867                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
868                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
869                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
870                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
871                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
872                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
873                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
874                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
875                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
876                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
877                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
878
879                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
880                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
881                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
882                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
883            }
884
885            /* Recon and store */
886            {
887                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
888                pu1_pred += pred_strd;
889                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
890                pu1_pred += pred_strd;
891                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
892                pu1_pred += pred_strd;
893                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
894                pu1_pred += pred_strd;
895                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
896                pu1_pred += pred_strd;
897                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
898                pu1_pred += pred_strd;
899                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
900                pu1_pred += pred_strd;
901                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
902
903                m_temp_reg_50 = _mm_setzero_si128();
904                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
905                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
906                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
907                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
908                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
909                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
910                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
911                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
912
913                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
914                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
915                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
916                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
917                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
918                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
919                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
920                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
921
922                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
923                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
924                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
925                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
926                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
927                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
928                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
929                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
930
931                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
932                pu1_dst += dst_strd;
933                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
934                pu1_dst += dst_strd;
935                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
936                pu1_dst += dst_strd;
937                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
938                pu1_dst += dst_strd;
939                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
940                pu1_dst += dst_strd;
941                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
942                pu1_dst += dst_strd;
943                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
944                pu1_dst += dst_strd;
945                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
946                pu1_dst += dst_strd;
947            }
948        }
949    }
950    else
951
952    {
953
954        /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
955        /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
956        if(!check_row_stage_1)
957        {
958            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
959            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
960            {
961                //Interleaving 0,4 row in 0 , 1 Rishab
962                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
963                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
964                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
965
966                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
967                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
968
969                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
970                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
971
972
973                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
974                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
975            }
976
977
978            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
979            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
980            {
981
982                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
983                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
984
985                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
986                //Interleaving 2,6 row in 4, 5 Rishab
987                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
988                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
989
990                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
991                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
992
993                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
994                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
995
996
997
998                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
999
1000                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1001                //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);
1002
1003                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1004                //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);
1005
1006            }
1007
1008            /* e */
1009            {
1010                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1011                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1012                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1013                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1014                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1015                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1016
1017                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1018                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1019
1020                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1021                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1022
1023                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1024                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1025
1026            }
1027
1028            /* o */
1029            {
1030
1031                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1032                {
1033
1034                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1035                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1036                    //o0:1B*89+3B*75,1T*89+3T*75
1037                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1038                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1039
1040                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1041                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1042
1043                }
1044
1045                /* Column 0 of destination computed here */
1046                /* It is stored in m_temp_reg_50 */
1047                /* Column 7 of destination computed here */
1048                /* It is stored in m_temp_reg_57 */
1049                {
1050
1051
1052                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1053                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1054
1055                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1056                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1057
1058                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1059                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1060                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1061                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1062
1063                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1064                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1065                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1066                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1067
1068                    //o1:1B*75-3B*18,1T*75-3T*18
1069                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1070                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1071
1072                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1073                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1074
1075                    /* Loading coeff for computing o2  in the next block */
1076
1077                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1078
1079                }
1080
1081                /* Column 1 of destination computed here */
1082                /* It is stored in m_temp_reg_51 */
1083                /* Column 6 of destination computed here */
1084                /* It is stored in m_temp_reg_56 */
1085                {
1086                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1087                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1088
1089                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1090                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1091
1092                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1093                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1094                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1095                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1096
1097                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1098                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1099                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1100                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1101
1102                    //o2:1B*50-3B*89,1T*50-3T*89
1103                    m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1104                    m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1105
1106                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1107                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1108
1109
1110                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1111
1112
1113                    /* Loading coeff for computing o3  in the next block */
1114
1115                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1116
1117                }
1118
1119                /* Column 2 of destination computed here */
1120                /* It is stored in m_temp_reg_52 */
1121                /* Column 5 of destination computed here */
1122                /* It is stored in m_temp_reg_55 */
1123                {
1124                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1125                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1126
1127                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1128                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1129
1130                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1131                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1132                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1133                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1134
1135                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1136                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1137                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1138                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1139
1140                    //o3:1B*18-3B*50,1T*18-3T*50
1141                    m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1142                    m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1143
1144                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1145                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1146
1147
1148
1149                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1150
1151
1152                }
1153
1154                /* Column 3 of destination computed here */
1155                /* It is stored in m_temp_reg_53 */
1156                /* Column 4 of destination computed here */
1157                /* It is stored in m_temp_reg_54 */
1158                {
1159                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1160                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1161
1162                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1163                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1164
1165                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1166                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1167                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1168                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1169
1170                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1171                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1172                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1173                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1174
1175                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1176                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1177                }
1178            }
1179
1180            /* Transpose of the destination 8x8 matrix done here */
1181            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1182            /* respectively */
1183            {
1184
1185
1186                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1187                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1188                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1189                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1190                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1191                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1192                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1193                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1194
1195                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1196                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1197                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1198                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1199                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1200                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1201                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1202                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1203
1204                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1205                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1206                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1207                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1208
1209                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1210                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1211                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1212                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1213            }
1214        }
1215        else
1216        {
1217
1218            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1219            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1220            {
1221                //Interleaving 0,4 row in 0 , 1 Rishab
1222                /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1223                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
1224                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
1225
1226                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1227                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1228
1229                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1230                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1231
1232
1233                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1234                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1235            }
1236
1237
1238            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1239            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1240            {
1241
1242                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1243                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1244
1245                /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1246                //Interleaving 2,6 row in 4, 5 Rishab
1247                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1248                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1249
1250                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1251                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1252
1253                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1254                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1255
1256
1257
1258                /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1259
1260                m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1261                m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
1262
1263                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1264                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
1265
1266            }
1267
1268            /* e */
1269            {
1270                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1271                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1272                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1273                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1274                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1275                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1276
1277                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1278                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1279
1280                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1281                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1282
1283                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1284                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1285
1286            }
1287
1288            /* o */
1289            {
1290
1291                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1292                {
1293
1294                    m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1295                    m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1296                    m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1297                    m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
1298                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1299                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1300                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1301                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1302                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1303
1304
1305                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1306                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1307
1308                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1309                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1310                }
1311
1312                /* Column 0 of destination computed here */
1313                /* It is stored in m_temp_reg_50 */
1314                /* Column 7 of destination computed here */
1315                /* It is stored in m_temp_reg_57 */
1316                {
1317
1318
1319                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1320                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1321
1322                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1323                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1324
1325                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1326                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1327                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1328                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1329
1330                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1331                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1332                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1333                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1334
1335                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1336                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1337                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1338                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1339                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1340
1341                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1342                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1343
1344                    /* Loading coeff for computing o2  in the next block */
1345
1346                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1347                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
1348
1349                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1350                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1351                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1352                }
1353
1354                /* Column 1 of destination computed here */
1355                /* It is stored in m_temp_reg_51 */
1356                /* Column 6 of destination computed here */
1357                /* It is stored in m_temp_reg_56 */
1358                {
1359                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1360                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1361
1362                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1363                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1364
1365                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1366                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1367                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1368                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1369
1370                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1371                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1372                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1373                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1374
1375                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1376                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1377                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1378                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1379                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1380
1381                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1382                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1383
1384
1385                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1386
1387
1388                    /* Loading coeff for computing o3  in the next block */
1389
1390                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1391                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
1392
1393                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1394                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1395                }
1396
1397                /* Column 2 of destination computed here */
1398                /* It is stored in m_temp_reg_52 */
1399                /* Column 5 of destination computed here */
1400                /* It is stored in m_temp_reg_55 */
1401                {
1402                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1403                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1404
1405                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1406                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1407
1408                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1409                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1410                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1411                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1412
1413                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1414                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1415                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1416                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1417
1418                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1419                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1420                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1421                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1422                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1423
1424                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1425                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1426
1427
1428
1429                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1430
1431
1432                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1433                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1434                }
1435
1436                /* Column 3 of destination computed here */
1437                /* It is stored in m_temp_reg_53 */
1438                /* Column 4 of destination computed here */
1439                /* It is stored in m_temp_reg_54 */
1440                {
1441                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1442                    m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1443
1444                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1445                    m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1446
1447                    m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1448                    m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1449                    m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1450                    m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1451
1452                    m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1453                    m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1454                    m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1455                    m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1456
1457                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1458                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1459                }
1460            }
1461
1462            /* Transpose of the destination 8x8 matrix done here */
1463            /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1464            /* respectively */
1465            {
1466
1467
1468                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1469                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1470                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1471                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1472                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1473                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1474                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1475                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1476
1477                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1478                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1479                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1480                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1481                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1482                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1483                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1484                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1485
1486                m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1487                m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1488                m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1489                m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1490
1491                m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1492                m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1493                m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1494                m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1495            }
1496        }
1497        /* Stage 2 */
1498
1499        i4_shift = IDCT_STG2_SHIFT;
1500
1501        {
1502
1503            /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1504            /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1505            {
1506                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
1507                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
1508
1509                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1510                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1511
1512                m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1513                m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1514                m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1515                m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1516
1517
1518                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
1519                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
1520            }
1521
1522
1523            /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1524            /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1525            {
1526                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1527                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1528
1529
1530                m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1531                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1532                m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1533                m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1534
1535                /* Loading coeff for computing o0 in the next block */
1536                m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
1537                m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);
1538
1539
1540                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1541                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1542            }
1543
1544            /* e */
1545            {
1546                /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1547                /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1548                /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1549                /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1550                m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1551                m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1552
1553                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1554                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1555
1556                m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1557                m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1558
1559                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1560                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1561
1562            }
1563
1564            /* o */
1565            {
1566                m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
1567                m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
1568
1569                /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1570                {
1571                    //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1572                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1573                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1574                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1575                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1576
1577                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1578                    m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1579                    /* Loading coeff for computing o1 in the next block */
1580                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
1581                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);
1582
1583                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1584                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1585                }
1586
1587                /* Column 0 of destination computed here */
1588                /* It is stored in m_temp_reg_50 */
1589                /* Column 7 of destination computed here */
1590                /* It is stored in m_temp_reg_57 */
1591                {
1592                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1593                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1594
1595                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1596                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1597
1598                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1599                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1600                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1601                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1602
1603                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1604                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1605                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1606                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1607
1608                    //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1609                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1610                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1611                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1612                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1613
1614                    m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1615                    m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1616
1617
1618                    /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1619
1620
1621                    /* Loading coeff for computing o2  in the next block */
1622                    m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
1623                    m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);
1624
1625                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1626                    m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1627                }
1628
1629                /* Column 1 of destination computed here */
1630                /* It is stored in m_temp_reg_51 */
1631                /* Column 6 of destination computed here */
1632                /* It is stored in m_temp_reg_56 */
1633                {
1634                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1635                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1636
1637                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1638                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1639
1640                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1641                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1642                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1643                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1644
1645                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1646                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1647                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1648                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1649
1650                    //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1651                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1652                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1653                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1654                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1655
1656                    m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1657                    m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1658
1659
1660                    /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1661
1662                    /* Loading coeff for computing o3  in the next block */
1663
1664                    m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
1665                    m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);
1666
1667                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1668                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1669                }
1670
1671                /* Column 2 of destination computed here */
1672                /* It is stored in m_temp_reg_52 */
1673                /* Column 5 of destination computed here */
1674                /* It is stored in m_temp_reg_55 */
1675                {
1676                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1677                    m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1678
1679                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1680                    m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1681
1682                    m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1683                    m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1684                    m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1685                    m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1686
1687                    m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1688                    m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1689                    m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1690                    m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1691
1692                    //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1693                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1694                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1695                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1696                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1697
1698                    m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1699                    m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1700
1701
1702
1703                    /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1704
1705
1706                    m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1707                    m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1708                }
1709
1710                /* Column 3 of destination computed here */
1711                /* It is stored in m_temp_reg_53 */
1712                /* Column 4 of destination computed here */
1713                /* It is stored in m_temp_reg_54 */
1714                {
1715                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1716                    m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1717
1718                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1719                    m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1720
1721                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1722                    m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1723                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1724                    m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1725
1726                    m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1727                    m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1728                    m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1729                    m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1730
1731                    m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1732                    m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1733                }
1734            }
1735
1736            /* Transpose of the destination 8x8 matrix done here */
1737            /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
1738            /* respectively */
1739            {
1740                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1741                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1742                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1743                m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1744                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1745                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1746                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1747                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1748
1749                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1750                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1751                m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1752                m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1753                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1754                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1755                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1756                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1757                m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1758                m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1759                m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1760                m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1761
1762                m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1763                m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1764                m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1765                m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1766            }
1767
1768            /* Recon and store */
1769            {
1770                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1771                pu1_pred += pred_strd;
1772                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1773                pu1_pred += pred_strd;
1774                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1775                pu1_pred += pred_strd;
1776                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1777                pu1_pred += pred_strd;
1778                m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1779                pu1_pred += pred_strd;
1780                m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1781                pu1_pred += pred_strd;
1782                m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1783                pu1_pred += pred_strd;
1784                m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1785
1786
1787                m_temp_reg_50 = _mm_setzero_si128();
1788                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1789                m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1790                m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1791                m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1792                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1793                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1794                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1795                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1796
1797                m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1798                m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1799                m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1800                m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1801                m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1802                m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1803                m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1804                m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1805
1806                m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1807                m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1808                m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1809                m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1810                m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1811                m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1812                m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1813                m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1814
1815                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1816                pu1_dst += dst_strd;
1817                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1818                pu1_dst += dst_strd;
1819                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1820                pu1_dst += dst_strd;
1821                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1822                pu1_dst += dst_strd;
1823                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1824                pu1_dst += dst_strd;
1825                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1826                pu1_dst += dst_strd;
1827                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1828                pu1_dst += dst_strd;
1829                _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1830                pu1_dst += dst_strd;
1831
1832            }
1833
1834
1835        }
1836
1837
1838    }
1839}
1840
1841void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
1842                            WORD16 *pi2_tmp,
1843                            UWORD8 *pu1_pred,
1844                            UWORD8 *pu1_dst,
1845                            WORD32 src_strd,
1846                            WORD32 pred_strd,
1847                            WORD32 dst_strd,
1848                            WORD32 zero_cols,
1849                            WORD32 zero_rows)
1850{
1851    WORD32 val;
1852    __m128i value_4x32b, mismatch_stg2_additive;
1853    __m128i pred_r, pred_half0, pred_half1;
1854    __m128i temp0, temp1;
1855    __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND);
1856
1857    UNUSED(pi2_tmp);
1858    UNUSED(src_strd);
1859    UNUSED(zero_cols);
1860    UNUSED(zero_rows);
1861
1862    val = pi2_src[0] * gai2_impeg2_idct_q15[0];
1863    val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
1864    val *= gai2_impeg2_idct_q11[0];
1865    value_4x32b = _mm_set1_epi32(val);
1866
1867    // Row 0 processing
1868    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive);
1869    pred_r = _mm_loadl_epi64((__m128i *) pu1_pred);
1870    pred_r =  _mm_cvtepu8_epi16(pred_r);
1871    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1872    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1873    pred_half0 = _mm_cvtepu16_epi32(pred_r);
1874    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1875
1876    pred_r = _mm_srli_si128(pred_r, 8);
1877
1878    temp0 = _mm_add_epi32(temp0, value_4x32b);
1879    temp1 = _mm_add_epi32(temp1, value_4x32b);
1880    temp0 = _mm_add_epi32(temp0, round_stg2);
1881    temp1 = _mm_add_epi32(temp1, round_stg2);
1882    pred_half1 = _mm_cvtepu16_epi32(pred_r);
1883    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1884    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1885    temp0 = _mm_add_epi32(temp0, pred_half0);
1886    temp1 = _mm_add_epi32(temp1, pred_half1);
1887
1888    temp0 = _mm_packus_epi32(temp0, temp1);
1889    temp0 = _mm_packus_epi16(temp0, temp1);
1890
1891    _mm_storel_epi64((__m128i *)pu1_dst, temp0);
1892
1893    // Row 1 processing
1894    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8));
1895    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
1896    pred_r =  _mm_cvtepu8_epi16(pred_r);
1897    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1898    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1899    pred_half0 = _mm_cvtepu16_epi32(pred_r);
1900    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1901
1902    pred_r = _mm_srli_si128(pred_r, 8);
1903
1904    temp0 = _mm_add_epi32(temp0, value_4x32b);
1905    temp1 = _mm_add_epi32(temp1, value_4x32b);
1906    temp0 = _mm_add_epi32(temp0, round_stg2);
1907    temp1 = _mm_add_epi32(temp1, round_stg2);
1908    pred_half1 = _mm_cvtepu16_epi32(pred_r);
1909    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1910    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1911    temp0 = _mm_add_epi32(temp0, pred_half0);
1912    temp1 = _mm_add_epi32(temp1, pred_half1);
1913
1914    temp0 = _mm_packus_epi32(temp0, temp1);
1915    temp0 = _mm_packus_epi16(temp0, temp1);
1916
1917    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0);
1918
1919    // Row 2 processing
1920    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16));
1921    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd));
1922    pred_r =  _mm_cvtepu8_epi16(pred_r);
1923    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1924    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1925    pred_half0 = _mm_cvtepu16_epi32(pred_r);
1926    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1927
1928    pred_r = _mm_srli_si128(pred_r, 8);
1929
1930    temp0 = _mm_add_epi32(temp0, value_4x32b);
1931    temp1 = _mm_add_epi32(temp1, value_4x32b);
1932    temp0 = _mm_add_epi32(temp0, round_stg2);
1933    temp1 = _mm_add_epi32(temp1, round_stg2);
1934    pred_half1 = _mm_cvtepu16_epi32(pred_r);
1935    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1936    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1937    temp0 = _mm_add_epi32(temp0, pred_half0);
1938    temp1 = _mm_add_epi32(temp1, pred_half1);
1939
1940    temp0 = _mm_packus_epi32(temp0, temp1);
1941    temp0 = _mm_packus_epi16(temp0, temp1);
1942
1943    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0);
1944
1945    // Row 3 processing
1946    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24));
1947    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd));
1948    pred_r =  _mm_cvtepu8_epi16(pred_r);
1949    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1950    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1951    pred_half0 = _mm_cvtepu16_epi32(pred_r);
1952    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1953
1954    pred_r = _mm_srli_si128(pred_r, 8);
1955
1956    temp0 = _mm_add_epi32(temp0, value_4x32b);
1957    temp1 = _mm_add_epi32(temp1, value_4x32b);
1958    temp0 = _mm_add_epi32(temp0, round_stg2);
1959    temp1 = _mm_add_epi32(temp1, round_stg2);
1960    pred_half1 = _mm_cvtepu16_epi32(pred_r);
1961    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1962    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1963    temp0 = _mm_add_epi32(temp0, pred_half0);
1964    temp1 = _mm_add_epi32(temp1, pred_half1);
1965
1966    temp0 = _mm_packus_epi32(temp0, temp1);
1967    temp0 = _mm_packus_epi16(temp0, temp1);
1968
1969    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0);
1970
1971    // Row 4 processing
1972    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32));
1973    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd));
1974    pred_r =  _mm_cvtepu8_epi16(pred_r);
1975    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1976    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1977    pred_half0 = _mm_cvtepu16_epi32(pred_r);
1978    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1979
1980    pred_r = _mm_srli_si128(pred_r, 8);
1981
1982    temp0 = _mm_add_epi32(temp0, value_4x32b);
1983    temp1 = _mm_add_epi32(temp1, value_4x32b);
1984    temp0 = _mm_add_epi32(temp0, round_stg2);
1985    temp1 = _mm_add_epi32(temp1, round_stg2);
1986    pred_half1 = _mm_cvtepu16_epi32(pred_r);
1987    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1988    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1989    temp0 = _mm_add_epi32(temp0, pred_half0);
1990    temp1 = _mm_add_epi32(temp1, pred_half1);
1991
1992    temp0 = _mm_packus_epi32(temp0, temp1);
1993    temp0 = _mm_packus_epi16(temp0, temp1);
1994
1995    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0);
1996
1997    // Row 5 processing
1998    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 40));
1999    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd));
2000    pred_r =  _mm_cvtepu8_epi16(pred_r);
2001    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2002    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2003    pred_half0 = _mm_cvtepu16_epi32(pred_r);
2004    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2005
2006    pred_r = _mm_srli_si128(pred_r, 8);
2007
2008    temp0 = _mm_add_epi32(temp0, value_4x32b);
2009    temp1 = _mm_add_epi32(temp1, value_4x32b);
2010    temp0 = _mm_add_epi32(temp0, round_stg2);
2011    temp1 = _mm_add_epi32(temp1, round_stg2);
2012    pred_half1 = _mm_cvtepu16_epi32(pred_r);
2013    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2014    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2015    temp0 = _mm_add_epi32(temp0, pred_half0);
2016    temp1 = _mm_add_epi32(temp1, pred_half1);
2017
2018    temp0 = _mm_packus_epi32(temp0, temp1);
2019    temp0 = _mm_packus_epi16(temp0, temp1);
2020
2021    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0);
2022
2023    // Row 6 processing
2024    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48));
2025    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd));
2026    pred_r =  _mm_cvtepu8_epi16(pred_r);
2027    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2028    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2029    pred_half0 = _mm_cvtepu16_epi32(pred_r);
2030    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2031
2032    pred_r = _mm_srli_si128(pred_r, 8);
2033
2034    temp0 = _mm_add_epi32(temp0, value_4x32b);
2035    temp1 = _mm_add_epi32(temp1, value_4x32b);
2036    temp0 = _mm_add_epi32(temp0, round_stg2);
2037    temp1 = _mm_add_epi32(temp1, round_stg2);
2038    pred_half1 = _mm_cvtepu16_epi32(pred_r);
2039    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2040    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2041    temp0 = _mm_add_epi32(temp0, pred_half0);
2042    temp1 = _mm_add_epi32(temp1, pred_half1);
2043
2044    temp0 = _mm_packus_epi32(temp0, temp1);
2045    temp0 = _mm_packus_epi16(temp0, temp1);
2046
2047    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), temp0);
2048
2049    // Row 7 processing
2050    mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56));
2051    pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd));
2052    pred_r =  _mm_cvtepu8_epi16(pred_r);
2053    temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2054    mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2055    pred_half0 = _mm_cvtepu16_epi32(pred_r);
2056    temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2057
2058    pred_r = _mm_srli_si128(pred_r, 8);
2059
2060    temp0 = _mm_add_epi32(temp0, value_4x32b);
2061    temp1 = _mm_add_epi32(temp1, value_4x32b);
2062    temp0 = _mm_add_epi32(temp0, round_stg2);
2063    temp1 = _mm_add_epi32(temp1, round_stg2);
2064    pred_half1 = _mm_cvtepu16_epi32(pred_r);
2065    temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2066    temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2067    temp0 = _mm_add_epi32(temp0, pred_half0);
2068    temp1 = _mm_add_epi32(temp1, pred_half1);
2069
2070    temp0 = _mm_packus_epi32(temp0, temp1);
2071    temp0 = _mm_packus_epi16(temp0, temp1);
2072
2073    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0);
2074}
2075
2076void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
2077                            WORD16 *pi2_tmp,
2078                            UWORD8 *pu1_pred,
2079                            UWORD8 *pu1_dst,
2080                            WORD32 src_strd,
2081                            WORD32 pred_strd,
2082                            WORD32 dst_strd,
2083                            WORD32 zero_cols,
2084                            WORD32 zero_rows)
2085{
2086    WORD32 val;
2087    __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3;
2088
2089    UNUSED(pi2_tmp);
2090    UNUSED(src_strd);
2091    UNUSED(zero_cols);
2092    UNUSED(zero_rows);
2093
2094    val = pi2_src[0] * gai2_impeg2_idct_q15[0];
2095    val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
2096    val = val * gai2_impeg2_idct_q11[0];
2097    val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
2098
2099    value_4x32b = _mm_set1_epi32(val);
2100
2101    //Row 0-1 processing
2102    pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2103    pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2104    pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2105    pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2106
2107    temp0 = _mm_cvtepu16_epi32(pred_r0);
2108    pred_r0 = _mm_srli_si128(pred_r0, 8);
2109    temp2 = _mm_cvtepu16_epi32(pred_r1);
2110    pred_r1 = _mm_srli_si128(pred_r1, 8);
2111    temp1 = _mm_cvtepu16_epi32(pred_r0);
2112    temp3 = _mm_cvtepu16_epi32(pred_r1);
2113
2114    temp0 = _mm_add_epi32(temp0, value_4x32b);
2115    temp2 = _mm_add_epi32(temp2, value_4x32b);
2116    temp1 = _mm_add_epi32(temp1, value_4x32b);
2117    temp3 = _mm_add_epi32(temp3, value_4x32b);
2118    temp0 = _mm_packus_epi32(temp0, temp1);
2119    temp2 = _mm_packus_epi32(temp2, temp3);
2120    temp0 = _mm_packus_epi16(temp0, temp1);
2121    temp2 = _mm_packus_epi16(temp2, temp3);
2122    _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2123    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2124
2125    //Row 2-3 processing
2126    pu1_pred += 2 * pred_strd;
2127    pu1_dst += 2 * dst_strd;
2128
2129    pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2130    pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2131    pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2132    pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2133
2134    temp0 = _mm_cvtepu16_epi32(pred_r0);
2135    pred_r0 = _mm_srli_si128(pred_r0, 8);
2136    temp2 = _mm_cvtepu16_epi32(pred_r1);
2137    pred_r1 = _mm_srli_si128(pred_r1, 8);
2138    temp1 = _mm_cvtepu16_epi32(pred_r0);
2139    temp3 = _mm_cvtepu16_epi32(pred_r1);
2140
2141    temp0 = _mm_add_epi32(temp0, value_4x32b);
2142    temp2 = _mm_add_epi32(temp2, value_4x32b);
2143    temp1 = _mm_add_epi32(temp1, value_4x32b);
2144    temp3 = _mm_add_epi32(temp3, value_4x32b);
2145    temp0 = _mm_packus_epi32(temp0, temp1);
2146    temp2 = _mm_packus_epi32(temp2, temp3);
2147    temp0 = _mm_packus_epi16(temp0, temp1);
2148    temp2 = _mm_packus_epi16(temp2, temp3);
2149    _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2150    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2151
2152    //Row 4-5 processing
2153    pu1_pred += 2 * pred_strd;
2154    pu1_dst += 2 * dst_strd;
2155
2156    pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2157    pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2158    pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2159    pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2160
2161    temp0 = _mm_cvtepu16_epi32(pred_r0);
2162    pred_r0 = _mm_srli_si128(pred_r0, 8);
2163    temp2 = _mm_cvtepu16_epi32(pred_r1);
2164    pred_r1 = _mm_srli_si128(pred_r1, 8);
2165    temp1 = _mm_cvtepu16_epi32(pred_r0);
2166    temp3 = _mm_cvtepu16_epi32(pred_r1);
2167
2168    temp0 = _mm_add_epi32(temp0, value_4x32b);
2169    temp2 = _mm_add_epi32(temp2, value_4x32b);
2170    temp1 = _mm_add_epi32(temp1, value_4x32b);
2171    temp3 = _mm_add_epi32(temp3, value_4x32b);
2172    temp0 = _mm_packus_epi32(temp0, temp1);
2173    temp2 = _mm_packus_epi32(temp2, temp3);
2174    temp0 = _mm_packus_epi16(temp0, temp1);
2175    temp2 = _mm_packus_epi16(temp2, temp3);
2176    _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2177    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2178
2179    //Row 6-7 processing
2180    pu1_pred += 2 * pred_strd;
2181    pu1_dst += 2 * dst_strd;
2182
2183    pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2184    pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2185    pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2186    pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2187
2188    temp0 = _mm_cvtepu16_epi32(pred_r0);
2189    pred_r0 = _mm_srli_si128(pred_r0, 8);
2190    temp2 = _mm_cvtepu16_epi32(pred_r1);
2191    pred_r1 = _mm_srli_si128(pred_r1, 8);
2192    temp1 = _mm_cvtepu16_epi32(pred_r0);
2193    temp3 = _mm_cvtepu16_epi32(pred_r1);
2194
2195    temp0 = _mm_add_epi32(temp0, value_4x32b);
2196    temp2 = _mm_add_epi32(temp2, value_4x32b);
2197    temp1 = _mm_add_epi32(temp1, value_4x32b);
2198    temp3 = _mm_add_epi32(temp3, value_4x32b);
2199    temp0 = _mm_packus_epi32(temp0, temp1);
2200    temp2 = _mm_packus_epi32(temp2, temp3);
2201    temp0 = _mm_packus_epi16(temp0, temp1);
2202    temp2 = _mm_packus_epi16(temp2, temp3);
2203    _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2204    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2205}
2206