1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_32x32_itrans_recon_x86_intr.c
22 *
23 * @brief
24 *  Contains function definitions for inverse  quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 *  100470
29 *
30 * @par List of Functions:
31 *  - ihevc_itrans_recon_32x32_sse42()
32 *
33 * @remarks
34 *  None
35 *
36 *******************************************************************************
37 */
38#include <stdio.h>
39#include <string.h>
40#include "ihevc_typedefs.h"
41#include "ihevc_platform_macros.h"
42#include "ihevc_macros.h"
43#include "ihevc_defs.h"
44#include "ihevc_trans_tables.h"
45#include "ihevc_iquant_itrans_recon.h"
46#include "ihevc_func_selector.h"
47#include "ihevc_trans_macros.h"
48
49#include <emmintrin.h>
50#include <smmintrin.h>
51#include <tmmintrin.h>
52
53/**
54 *******************************************************************************
55 *
56 * @brief
57 *  This function performs inverse transform and reconstruction for
58 * 32x32 input block
59 *
60 * @par Description:
61 *  Performs inverse transform, adds the prediction data and
62 * clips output to 8 bit
63 *
64 * @param[in] pi2_src
65 *  Input 32x32 coefficients
66 *
67 * @param[in] pi2_tmp
68 *  Temporary scratch buffer, also used for storing inverse
69 *  transform 1st stage output
70 *
71 * @param[in] pu1_pred
72 *  Prediction 32x32 block
73 *
77 * @param[out] pu1_dst
78 *  Output 32x32 block
79 *
86 * @param[in] src_strd
87 *  Input stride
88 *
89 * @param[in] pred_strd
90 *  Prediction stride
91 *
92 * @param[in] dst_strd
93 *  Output Stride
94 *
95 * @param[in] zero_cols,zero_rows
96 *  Bit masks of all-zero columns / rows in pi2_src (bit i set => column/row i is zero)
97 *
98 * @returns  Void
99 *
100 * @remarks
101 *  None
102 *
103 *******************************************************************************
104 */
105/**/
106
107void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
108                                    WORD16 *pi2_tmp,
109                                    UWORD8 *pu1_pred,
110                                    UWORD8 *pu1_dst,
111                                    WORD32 src_strd,
112                                    WORD32 pred_strd,
113                                    WORD32 dst_strd,
114                                    WORD32 zero_cols,
115                                    WORD32 zero_rows)
116{
117    /* Inverse Transform */
118
119    WORD32 j;
120
121
122    WORD16 *pi2_tmp_orig;
123
124
125    WORD16 *o_temp_ptr;
126    WORD16 *temp_ptr;
127
128    __m128i m_temp_reg_0;
129    __m128i m_temp_reg_1;
130    __m128i m_temp_reg_2;
131    __m128i m_temp_reg_3;
132    __m128i m_temp_reg_4;
133    __m128i m_temp_reg_5;
134    __m128i m_temp_reg_6;
135    __m128i m_temp_reg_7;
136    __m128i m_temp_reg_10;
137    __m128i m_temp_reg_11;
138    __m128i m_temp_reg_12;
139    __m128i m_temp_reg_13;
140    __m128i m_temp_reg_14;
141    __m128i m_temp_reg_15;
142    __m128i m_temp_reg_16;
143    __m128i m_temp_reg_17;
144    __m128i m_temp_reg_18;
145    __m128i m_temp_reg_19;
146    __m128i m_temp_reg_20;
147    __m128i m_temp_reg_21;
148    __m128i m_temp_reg_22;
149    __m128i m_temp_reg_23;
150    __m128i m_temp_reg_30;
151    __m128i m_temp_reg_31;
152    __m128i m_temp_reg_32;
153    __m128i m_temp_reg_33;
154    __m128i m_temp_reg_34;
155    __m128i m_temp_reg_35;
156    __m128i m_temp_reg_36;
157    __m128i m_temp_reg_37;
158    __m128i m_temp_reg_40;
159    __m128i m_temp_reg_41;
160    __m128i m_temp_reg_42;
161    __m128i m_temp_reg_43;
162    __m128i m_temp_reg_44;
163    __m128i m_temp_reg_45;
164    __m128i m_temp_reg_46;
165    __m128i m_temp_reg_47;
166
167    __m128i m_temp_reg_70;
168    __m128i m_temp_reg_71;
169    __m128i m_temp_reg_72;
170    __m128i m_temp_reg_73;
171    __m128i m_temp_reg_74;
172    __m128i m_temp_reg_75;
173    __m128i m_temp_reg_76;
174    __m128i m_temp_reg_77;
175
176    __m128i m_temp_reg_80;
177    __m128i m_temp_reg_81;
178    __m128i m_temp_reg_82;
179    __m128i m_temp_reg_83;
180    __m128i m_temp_reg_84;
181    __m128i m_temp_reg_85;
182    __m128i m_temp_reg_86;
183    __m128i m_temp_reg_87;
184
185    __m128i m_temp_reg_90;
186    __m128i m_temp_reg_91;
187    __m128i m_temp_reg_92;
188    __m128i m_temp_reg_93;
189    __m128i m_temp_reg_94;
190    __m128i m_temp_reg_95;
191    __m128i m_temp_reg_96;
192    __m128i m_temp_reg_97;
193
194    __m128i m_rdng_factor;
195    __m128i m_count;
196    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
197    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
198
199    __m128i temp1, temp2, temp3, temp4;
200    __m128i temp5, temp6, temp7, temp8;
201
202    __m128i all_zero_reg;
203    WORD32 i;
204
205    /*Lokesh*/
206    WORD32  zero_last24_cols_stg1;
207    WORD32  zero_last24_rows_stg1;
208    WORD32  zero_last28_rows_stg1;
209
210    WORD32  zero_last28_rows_stg2;
211    WORD32  zero_last24_rows_stg2;
212
213    WORD32  trans_size_stg1;
214
215    WORD32 i4_shift = IT_SHIFT_STAGE_1;
216    WORD32 trans_size = TRANS_SIZE_32;
217
218
219    /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
220    zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
221    zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
222    zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
223
224    zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
225    zero_last24_rows_stg2 = zero_last24_cols_stg1;
226
227    if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
228    {
229        trans_size_stg1 = 8;
230
231    }
232    else
233    {
234        trans_size_stg1 = 32;
235    }
236
237    all_zero_reg = _mm_setzero_si128();
238
239    o_temp_ptr  = pi2_tmp;
240    temp_ptr = (pi2_tmp + 1024);
241
242    pi2_tmp += 2048;
243    pi2_tmp_orig = pi2_tmp;
244
245    for(i = 0; i < trans_size_stg1; i += 8)
246    {
247
248        {
249            WORD16 *pi2_tmp_src = pi2_src;
250
251            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
252            pi2_tmp_src += (src_strd << 1);
253            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
254            pi2_tmp_src += (src_strd << 1);
255            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
256            pi2_tmp_src += (src_strd << 1);
257            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
258            pi2_tmp_src += (src_strd << 1);
259            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
260            pi2_tmp_src += (src_strd << 1);
261            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
262            pi2_tmp_src += (src_strd << 1);
263            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
264            pi2_tmp_src += (src_strd << 1);
265            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
266            pi2_tmp_src += (src_strd << 1);
267
268            m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
269            pi2_tmp_src += (src_strd << 1);
270            m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
271            pi2_tmp_src += (src_strd << 1);
272            m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
273            pi2_tmp_src += (src_strd << 1);
274            m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
275            pi2_tmp_src += (src_strd << 1);
276            m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
277            pi2_tmp_src += (src_strd << 1);
278            m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
279            pi2_tmp_src += (src_strd << 1);
280            m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
281            pi2_tmp_src += (src_strd << 1);
282            m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
283        }
284
285        if(zero_last28_rows_stg1)
286        {
287            /* eeo */
288            /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
289            /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
290            {
291                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
292
293                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
294
295                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
296
297/* eeeo[0]= m_temp_reg_20  */
298/* eeeo[1]= m_temp_reg_21  */
299/* eeee[0]= m_temp_reg_22  */
300/* eeee[1]= m_temp_reg_23  */
301
302                /* eee[0] = eeee[0] + eeeo[0]; */
303                m_temp_reg_40 = m_temp_reg_14;
304
305                /* eee[3] = eeee[0] - eeeo[0]; */
306                m_temp_reg_43 = m_temp_reg_14;
307
308                /* eee[2] = eeee[1] - eeeo[1]; */
309                m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
310
311                /* eee[1] = eeee[1] + eeeo[1];*/
312                m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
313
314                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
315
316                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
317
318                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
319
320/* eeeo[0]= m_temp_reg_20  */
321/* eeeo[1]= m_temp_reg_21  */
322/* eeee[0]= m_temp_reg_22  */
323/* eeee[1]= m_temp_reg_23  */
324
325                /* eee[0] = eeee[0] + eeeo[0]; */
326                m_temp_reg_44 = m_temp_reg_14;
327
328                /* eee[3] = eeee[0] - eeeo[0]; */
329                m_temp_reg_47 = m_temp_reg_14;
330
331                /* eee[2] = eeee[1] - eeeo[1]; */
332                m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
333
334                /* eee[1] = eeee[1] + eeeo[1];*/
335                m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
336
337
338            }
339            /* eo */
340            {
341                WORD16 *pi2_scratch = o_temp_ptr;
342
343                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
344                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
345                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
346                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
347                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
348                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
349                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
350                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
351
352                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
353
354                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
355
356                /* eo0[0-3] */
357                {
358                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
359
360                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
361
362                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
363                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
364
365                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
366                    pi2_scratch += 8;
367                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
368                    pi2_scratch += 8;
369
370                }
371
372                /* eo0[4-7] */
373                {
374                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
375
376                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
377                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
378
379                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
380                    pi2_scratch += 8;
381                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
382                    pi2_scratch += 8;
383
384                }
385                /* eo1[0-3] */
386                {
387                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
388
389                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
390                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
391
392                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
393                    pi2_scratch += 8;
394                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
395                    pi2_scratch += 8;
396
397                }
398
399                /* eo1[4-7] */
400                {
401                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
402
403                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
404                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
405
406                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
407                    pi2_scratch += 8;
408                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
409                    pi2_scratch += 8;
410
411                }
412
413                /* eo2[0-3] */
414                {
415                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
416
417                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
418                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
419
420                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
421                    pi2_scratch += 8;
422                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
423                    pi2_scratch += 8;
424
425                }
426
427                /* eo2[4-7] */
428                {
429                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
430
431                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
432                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
433
434                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
435                    pi2_scratch += 8;
436                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
437                    pi2_scratch += 8;
438
439                }
440
441                /**************************************************************************/
442
443
444                /* eo3[0-3] */
445                {
446                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
447
448                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
449                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
450
451                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
452                    pi2_scratch += 8;
453                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
454                    pi2_scratch += 8;
455
456                }
457
458                /* eo3[4-7] */
459                {
460                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
461
462                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
463                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
464
465                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
466                    pi2_scratch += 8;
467                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
468                    pi2_scratch += 8;
469
470                }
471
472
473                /* eo4[0-3] */
474                {
475                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
476
477                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
478                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
479
480                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
481                    pi2_scratch += 8;
482                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
483                    pi2_scratch += 8;
484
485                }
486                /* eo4[4-7] */
487                {
488                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
489
490                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
491                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
492
493                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
494                    pi2_scratch += 8;
495                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
496                    pi2_scratch += 8;
497
498                }
499
500                /***********************************************************************/
501
502                /* eo5[0-3] */
503                {
504                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
505
506                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
507                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
508
509                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
510                    pi2_scratch += 8;
511                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
512                    pi2_scratch += 8;
513
514                }
515
516
517                /* eo5[4-7] */
518                {
519                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
520
521                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
522                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
523
524                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
525                    pi2_scratch += 8;
526                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
527                    pi2_scratch += 8;
528
529                }
530
531                /* eo6[0-3] */
532                {
533                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
534
535                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
536                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
537
538                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
539                    pi2_scratch += 8;
540                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
541                    pi2_scratch += 8;
542
543                }
544
545
546                /* eo6[4-7] */
547                {
548                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
549
550                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
551                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
552
553                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
554                    pi2_scratch += 8;
555                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
556                    pi2_scratch += 8;
557
558                }
559
560
561                /* eo7[0-3] */
562                {
563                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
564
565                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
566                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
567
568                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
569                    pi2_scratch += 8;
570                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
571                    pi2_scratch += 8;
572
573                }
574
575
576                /* eo7[4-7] */
577                {
578                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
579
580                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
581                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
582
583                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
584                    pi2_scratch += 8;
585                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
586                    pi2_scratch += 8;
587
588                }
589
590            }
591        }
592        else if(zero_last24_rows_stg1)
593        {
594            {
595                /* eeo */
596                /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
597                /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
598
599                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
600                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
601
602                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
603
604                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
605
606                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
607
608                /* eeeo[0]= m_temp_reg_20  */
609                /* eeeo[1]= m_temp_reg_21  */
610                /* eeee[0]= m_temp_reg_22  */
611                /* eeee[1]= m_temp_reg_23  */
612
613                /* eee[0] = eeee[0] + eeeo[0]; */
614                m_temp_reg_40 = m_temp_reg_14;
615
616                /* eee[3] = eeee[0] - eeeo[0]; */
617                m_temp_reg_43 = m_temp_reg_14;
618
619                /* eee[2] = eeee[1] - eeeo[1]; */
620                m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
621
622                /* eee[1] = eeee[1] + eeeo[1];*/
623                m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
624
625                /* for row 4 to 7 */
626
627                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
628
629                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
630
631                m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
632
633                /* eeeo[0]= m_temp_reg_20  */
634                /* eeeo[1]= m_temp_reg_21  */
635                /* eeee[0]= m_temp_reg_22  */
636                /* eeee[1]= m_temp_reg_23  */
637
638                /* eee[0] = eeee[0] + eeeo[0]; */
639                m_temp_reg_44 = m_temp_reg_14;
640
641                /* eee[3] = eeee[0] - eeeo[0]; */
642                m_temp_reg_47 = m_temp_reg_14;
643
644                /* eee[2] = eeee[1] - eeeo[1]; */
645                m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
646
647                /* eee[1] = eeee[1] + eeeo[1];*/
648                m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
649
650
651                // eeo[]
652                /* for(k = 0; k < 4; k++) */
653
654                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
655                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
656                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
657                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
658
659                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
660
661                m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
662
663                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
664
665                m_temp_reg_33 = _mm_setzero_si128();
666
667                /* eeo */
668                {
669                    /* eeo0[0-3] */
670                    {
671                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
672
673                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
674                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
675
676                        m_temp_reg_90 = m_temp_reg_34;
677                        m_temp_reg_97 = m_temp_reg_35;
678                    }
679                    /* eeo0[4-7] */
680                    {
681                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
682
683                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
684                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
685
686                        m_temp_reg_91 = m_temp_reg_34;
687                        m_temp_reg_96 = m_temp_reg_35;
688
689                    }
690
691                    /* eeo1[0-3] */
692                    {
693                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
694
695                        /* e[1][0-3] stored in pi2_tmp[2][0-7] */
696                        /* e[6][0-3] stored in pi2_tmp[2][8-15] */
697                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
698                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
699
700                        m_temp_reg_92 = m_temp_reg_34;
701                        m_temp_reg_95 = m_temp_reg_35;
702
703                    }
704
705                    /* eo1[4-7] */
706                    {
707                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
708
709                        /* e[1][4-7] stored in pi2_tmp[3][0-7] */
710                        /* e[6][4-7] stored in pi2_tmp[3][8-15] */
711                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
712                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
713
714                        m_temp_reg_93 = m_temp_reg_34;
715                        m_temp_reg_94 = m_temp_reg_35;
716
717
718                    }
719
720                    /* eo2[0-3] */
721                    {
722                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
723
724                        /* e[2][0-3] stored in pi2_tmp[4][0-7] */
725                        /* e[5][0-3] stored in pi2_tmp[4][8-15] */
726                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
727                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
728
729                        temp1 = m_temp_reg_34;
730                        temp7 = m_temp_reg_35;
731
732                    }
733
734                    /* eo2[4-7] */
735                    {
736                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
737
738                        /* e[2][4-7] stored in pi2_tmp[5][0-7] */
739                        /* e[5][4-7] stored in pi2_tmp[5][8-15] */
740                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
741                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
742
743                        temp2 = m_temp_reg_34;
744                        temp6 = m_temp_reg_35;
745
746                    }
747
748                    /* eo3[0-3] */
749                    {
750                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
751
752                        /* e[3][0-3] stored in pi2_tmp[6][0-7] */
753                        /* e[4][0-3] stored in pi2_tmp[6][8-15] */
754                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
755                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
756
757                        temp3 = m_temp_reg_34;
758                        temp5 = m_temp_reg_35;
759
760                    }
761
762
763                    /* eo3[4-7] */
764                    {
765                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
766
767                        /* e[3][4-7] stored in pi2_tmp[7][0-7] */
768                        /* e[4][4-7] stored in pi2_tmp[7][8-15] */
769                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
770                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
771
772                        temp4 = m_temp_reg_34;
773                        temp8 = m_temp_reg_35;
774
775
776                    }
777                    /* All values of ee[] array in pi2_temp */
778
779                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
780                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
781                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
782                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
783
784                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
785
786                    m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
787                    m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
788
789                }
790            }
791            /* eo */
792            {
793
794                WORD16 *pi2_scratch = o_temp_ptr;
795
796                /* eo0[0-3] */
797                {
798                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
799
800                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
801                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
802
803                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
804                    pi2_scratch += 8;
805                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
806                    pi2_scratch += 8;
807
808                }
809
810
811                /* eo0[4-7] */
812                {
813                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
814
815                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
816
817                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
818                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
819
820                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
821                    pi2_scratch += 8;
822                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
823                    pi2_scratch += 8;
824
825                }
826
827                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
828
829                /* eo1[0-3] */
830                {
831                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
832
833                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
834                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
835
836                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
837                    pi2_scratch += 8;
838                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
839                    pi2_scratch += 8;
840
841                }
842
843
844                /* eo1[4-7] */
845                {
846                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
847
848                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
849                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
850
851                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
852                    pi2_scratch += 8;
853                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
854                    pi2_scratch += 8;
855
856                }
857
858                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
859
860                /* eo2[0-3] */
861                {
862
863                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
864
865                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
866                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
867
868                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
869                    pi2_scratch += 8;
870                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
871                    pi2_scratch += 8;
872
873                }
874
875                /* eo2[4-7] */
876                {
877
878                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
879
880                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
881                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
882
883                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
884                    pi2_scratch += 8;
885                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
886                    pi2_scratch += 8;
887
888                }
889
890                /**************************************************************************/
891
892
893
894                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
895
896                /* eo3[0-3] */
897                {
898
899                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
900
901                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
902                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
903
904                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
905                    pi2_scratch += 8;
906                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
907                    pi2_scratch += 8;
908
909                }
910
911
912                /* eo3[4-7] */
913                {
914
915                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
916
917                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
918                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
919
920                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
921                    pi2_scratch += 8;
922                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
923                    pi2_scratch += 8;
924
925                }
926
927                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
928
929                /* eo4[0-3] */
930                {
931                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
932
933                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
934                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
935
936                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
937                    pi2_scratch += 8;
938                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
939                    pi2_scratch += 8;
940
941                }
942                /* eo4[4-7] */
943                {
944                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
945
946                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
947                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
948
949                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
950                    pi2_scratch += 8;
951                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
952                    pi2_scratch += 8;
953
954                }
955
956                /***********************************************************************/
957
958                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
959
960                /* eo5[0-3] */
961                {
962
963                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
964
965                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
966                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
967
968                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
969                    pi2_scratch += 8;
970                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
971                    pi2_scratch += 8;
972
973                }
974
975
976                /* eo5[4-7] */
977                {
978                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
979
980                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
981                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
982
983                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
984                    pi2_scratch += 8;
985                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
986                    pi2_scratch += 8;
987
988                }
989
990                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
991
992                /* eo6[0-3] */
993                {
994                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
995
996                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
997                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
998
999                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1000                    pi2_scratch += 8;
1001                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1002                    pi2_scratch += 8;
1003
1004                }
1005
1006
1007                /* eo6[4-7] */
1008                {
1009
1010                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1011
1012                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1013                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1014
1015                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1016                    pi2_scratch += 8;
1017                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1018                    pi2_scratch += 8;
1019
1020                }
1021
1022                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1023
1024                /* eo7[0-3] */
1025                {
1026
1027                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1028
1029                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1030                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1031
1032                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1033                    pi2_scratch += 8;
1034                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1035                    pi2_scratch += 8;
1036
1037                }
1038
1039
1040                /* eo7[4-7] */
1041                {
1042                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1043
1044                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1045                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1046
1047                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1048                    pi2_scratch += 8;
1049                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1050                    pi2_scratch += 8;
1051
1052                }
1053
1054            }
1055
1056        }
1057        else
1058        {
1059
1060            {
1061                /* eeo */
1062                /* eeeo[0] stored in m_temp_reg_20, eeee[0] stored in m_temp_reg_21 */
1063                /* eeeo[1] stored in m_temp_reg_22, eeee[1] stored in m_temp_reg_23 */
1064
1065                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
1066                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
1067
1068                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
1069                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
1070
1071                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1072
1073                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1074
1075                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1076                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1077
1078                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1079                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1080
1081
1082                /* eeeo[0]= m_temp_reg_20  */
1083                /* eeee[0]= m_temp_reg_21  */
1084                /* eeeo[1]= m_temp_reg_22  */
1085                /* eeee[1]= m_temp_reg_23  */
1086
1087                /* eee[0] = eeee[0] + eeeo[0]; */
1088                m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1089
1090                /* eee[3] = eeee[0] - eeeo[0]; */
1091                m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1092
1093                /* eee[2] = eeee[1] - eeeo[1]; */
1094                m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1095
1096                /* eee[1] = eeee[1] + eeeo[1];*/
1097                m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1098
1099                /* for row 4 to 7 */
1100
1101                m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
1102                m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
1103
1104                /* Interleaving row 8 and row 24*/
1105                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1106
1107                m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1108                m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
1109
1110                m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1111
1112                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1113                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1114
1115                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1116                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1117
1118
1119                /* eeeo[0]= m_temp_reg_20  */
1120                /* eeee[0]= m_temp_reg_21  */
1121                /* eeeo[1]= m_temp_reg_22  */
1122                /* eeee[1]= m_temp_reg_23  */
1123
1124                /* eee[0] = eeee[0] + eeeo[0]; */
1125                m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1126
1127                /* eee[3] = eeee[0] - eeeo[0]; */
1128                m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1129
1130                /* eee[2] = eeee[1] - eeeo[1]; */
1131                m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1132
1133                /* eee[1] = eeee[1] + eeeo[1];*/
1134                m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1135
1136
1137                // eeo[]
1138                /* for(k = 0; k < 4; k++) */
1139
1140                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
1141                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
1142
1143                /* eeo */
1144                {
1145                    /* eeo0[0-3] */
1146                    {
1147                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1148                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1149
1150                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1151                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1152
1153                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1154
1155                        m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1156                        m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1157
1158                    }
1159
1160                    m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
1161                    m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
1162                    m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
1163                    m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
1164
1165                    /* eeo0[4-7] */
1166                    {
1167                        m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1168                        m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1169
1170                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1171                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1172
1173                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1174
1175                        m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
1176                        m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
1177
1178                    }
1179
1180
1181                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
1182                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
1183
1184                    /* eeo1[0-3] */
1185                    {
1186                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1187                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1188
1189                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
1190                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
1191
1192                        m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1193                        m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1194
1195                    }
1196
1197                    /* eeo1[4-7] */
1198                    {
1199
1200                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1201                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1202
1203                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
1204                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
1205
1206                        m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1207                        m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1208
1209
1210                    }
1211
1212                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
1213                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
1214
1215                    /* eeo2[0-3] */
1216                    {
1217
1218                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1219                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1220
1221                        /* ee[2][0-3] kept in register temp1 */
1222                        /* ee[5][0-3] kept in register temp7 */
1223
1224                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
1225                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
1226
1227                        temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1228                        temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1229
1230                    }
1231
1232                    /* eeo2[4-7] */
1233                    {
1234
1235                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1236                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1237
1238                        /* ee[2][4-7] kept in register temp2 */
1239                        /* ee[5][4-7] kept in register temp6 */
1240
1241                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
1242                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
1243
1244                        temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1245                        temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1246
1247                    }
1248
1249                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
1250                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
1251
1252                    /* eeo3[0-3] */
1253                    {
1254
1255                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1256                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1257
1258                        /* ee[3][0-3] kept in register temp3 */
1259                        /* ee[4][0-3] kept in register temp5 */
1260
1261                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
1262                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
1263
1264                        temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1265                        temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1266
1267
1268                    }
1269
1270                    /* eeo3[4-7] */
1271                    {
1272
1273                        m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1274                        m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1275
1276                        /* ee[3][4-7] kept in register temp4 */
1277                        /* ee[4][4-7] kept in register temp8 */
1278
1279                        m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
1280                        m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
1281                        temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1282                        temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1283
1284                    }
1285
1286
1287                    /* All values of ee[] are now held in registers m_temp_reg_90..97 and temp1..temp8 */
1288
1289                    /* for(k = 0; k < 8; k++) */
1290                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
1291                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
1292                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
1293                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
1294                }
1295            }
1296            /* eo */
1297            {
1298
1299                WORD16 *pi2_scratch = o_temp_ptr;
1300
1301                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1302                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1303                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1304                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1305
1306                m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1307                m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
1308                m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
1309                m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
1310
1311                m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
1312                m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
1313                m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
1314                m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
1315
1316                /* eo0[0-3] */
1317                {
1318                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1319                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1320
1321                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1322
1323                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1324                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1325
1326                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1327
1328                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1329
1330                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
1331                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
1332
1333                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1334                    pi2_scratch += 8;
1335                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1336                    pi2_scratch += 8;
1337
1338                }
1339                /* eo0[4-7] */
1340                {
1341                    m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1342                    m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1343                    m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1344                    m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1345
1346                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1347                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1348
1349                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1350
1351                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1352                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1353
1354                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1355
1356                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1357
1358                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
1359                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
1360
1361                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1362                    pi2_scratch += 8;
1363                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1364                    pi2_scratch += 8;
1365
1366                }
1367
1368                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
1369                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
1370                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
1371                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
1372
1373                /* eo1[0-3] */
1374                {
1375
1376                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1377                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1378
1379                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1380
1381                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1382                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1383
1384                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1385
1386                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1387
1388                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
1389                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
1390
1391                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1392                    pi2_scratch += 8;
1393                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1394                    pi2_scratch += 8;
1395
1396                }
1397
1398                /* eo1[4-7] */
1399                {
1400                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1401                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1402
1403                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1404
1405                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1406                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1407
1408                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1409
1410                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1411
1412                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
1413                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
1414
1415                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1416                    pi2_scratch += 8;
1417                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1418                    pi2_scratch += 8;
1419
1420                }
1421
1422                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
1423                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
1424                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
1425                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
1426
1427                /* eo2[0-3] */
1428                {
1429                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1430                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1431
1432                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1433
1434                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1435                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1436
1437                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1438
1439                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1440
1441                    m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
1442                    m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
1443
1444                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1445                    pi2_scratch += 8;
1446                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1447                    pi2_scratch += 8;
1448
1449                }
1450
1451
1452                /* eo2[4-7] */
1453                {
1454
1455                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1456                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1457
1458                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1459
1460                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1461                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1462
1463                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1464
1465                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1466
1467                    m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
1468                    m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
1469
1470                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1471                    pi2_scratch += 8;
1472                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1473                    pi2_scratch += 8;
1474
1475                }
1476                /**************************************************************************/
1477
1478                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
1479                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
1480                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
1481                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
1482
1483                /* eo3[0-3] */
1484                {
1485                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1486                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1487
1488                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1489
1490                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1491                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1492
1493                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1494
1495                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1496
1497                    m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
1498                    m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
1499
1500                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1501                    pi2_scratch += 8;
1502                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1503                    pi2_scratch += 8;
1504
1505                }
1506
1507
1508                /* eo3[4-7] */
1509                {
1510                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1511                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1512
1513                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1514
1515                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1516                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1517
1518                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1519
1520                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1521
1522                    m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
1523                    m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
1524
1525                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1526                    pi2_scratch += 8;
1527                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1528                    pi2_scratch += 8;
1529
1530                }
1531
1532                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
1533                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
1534                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
1535                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
1536
1537                /* eo4[0-3] */
1538                {
1539
1540                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1541                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1542
1543                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1544
1545                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1546                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1547
1548                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1549
1550                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1551
1552                    m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
1553                    m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
1554
1555                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1556                    pi2_scratch += 8;
1557                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1558                    pi2_scratch += 8;
1559
1560                }
1561
1562
1563                /* eo4[4-7] */
1564                {
1565                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1566                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1567
1568                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1569
1570                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1571                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1572
1573                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1574
1575                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1576
1577                    m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
1578                    m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
1579
1580                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1581                    pi2_scratch += 8;
1582                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1583                    pi2_scratch += 8;
1584
1585                }
1586
1587                /***********************************************************************/
1588
1589                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
1590                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
1591                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
1592                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
1593
1594                /* eo5[0-3] */
1595                {
1596                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1597                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1598
1599                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1600
1601                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1602                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1603
1604                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1605
1606                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1607
1608                    m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
1609                    m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
1610
1611                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1612                    pi2_scratch += 8;
1613                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1614                    pi2_scratch += 8;
1615
1616                }
1617
1618
1619                /* eo5[4-7] */
1620                {
1621                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1622                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1623
1624                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1625
1626                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1627                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1628
1629                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1630
1631                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1632
1633                    m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
1634                    m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
1635
1636                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1637                    pi2_scratch += 8;
1638                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1639                    pi2_scratch += 8;
1640
1641                }
1642
1643                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
1644                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
1645                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
1646                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
1647
1648                /* eo6[0-3] */
1649                {
1650
1651                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1652                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1653
1654                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1655
1656                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1657                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1658
1659                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1660
1661                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1662
1663                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1664                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1665
1666                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1667                    pi2_scratch += 8;
1668                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1669                    pi2_scratch += 8;
1670
1671                }
1672
1673
1674                /* eo6[4-7] */
1675                {
1676                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1677                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1678
1679                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1680
1681                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1682                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1683
1684                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1685
1686                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1687
1688                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1689                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1690
1691                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1692                    pi2_scratch += 8;
1693                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1694                    pi2_scratch += 8;
1695
1696                }
1697
1698                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1699                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
1700                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
1701                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
1702
1703                /* eo7[0-3] */
1704                {
1705
1706                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1707                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1708
1709                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1710
1711                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1712                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1713
1714                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1715
1716                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1717
1718                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1719                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1720
1721                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1722                    pi2_scratch += 8;
1723                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1724                    pi2_scratch += 8;
1725
1726                }
1727
1728
1729                /* eo7[4-7] */
1730                {
1731
1732                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1733                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1734
1735                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1736
1737                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1738                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1739
1740                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1741
1742                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1743
1744                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1745                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1746
1747                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1748                    pi2_scratch += 8;
1749                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1750                    pi2_scratch += 8;
1751
1752                }
1753
1754            }
1755
1756        }
1757        /*  All e[] are done */
1758        /****************************/
1759
1760        {
1761
1762            WORD16 *pi2_tmp_src = pi2_src + src_strd;
1763
1764            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1765            pi2_tmp_src += (src_strd << 1);
1766            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1767            pi2_tmp_src += (src_strd << 1);
1768            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1769            pi2_tmp_src += (src_strd << 1);
1770            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1771            pi2_tmp_src += (src_strd << 1);
1772            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1773            pi2_tmp_src += (src_strd << 1);
1774            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1775            pi2_tmp_src += (src_strd << 1);
1776            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1777            pi2_tmp_src += (src_strd << 1);
1778            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1779            pi2_tmp_src += (src_strd << 1);
1780
1781            m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1782            pi2_tmp_src += (src_strd << 1);
1783            m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1784            pi2_tmp_src += (src_strd << 1);
1785            m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1786            pi2_tmp_src += (src_strd << 1);
1787            m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1788            pi2_tmp_src += (src_strd << 1);
1789            m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1790            pi2_tmp_src += (src_strd << 1);
1791            m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1792            pi2_tmp_src += (src_strd << 1);
1793            m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1794            pi2_tmp_src += (src_strd << 1);
1795            m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1796        }
1797
1798        if(zero_last28_rows_stg1)
1799        {
1800            /* o & stage 1 out */
1801            {
1802                WORD32 j;
1803                WORD16 *pi2_src_scratch = o_temp_ptr;
1804                WORD16 *pi2_dst_scratch = temp_ptr;
1805                WORD32 out_stride = (trans_size << 1);
1806                WORD32 in_stride = trans_size;
1807
1808                for(j = 0; j < 2; j++)
1809                {
1810                    if(j)
1811                    {
1812                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1813                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1814                    }
1815
1816                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
1817
1818                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
1819
1820                    /* o0[0-3] */
1821                    {
1822                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1823
1824                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1825                        pi2_src_scratch += in_stride;
1826
1827                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1828                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1829
1830                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1831                        m_count = _mm_cvtsi32_si128(i4_shift);
1832                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1833                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1834
1835                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1836                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1837                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1838                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1839
1840                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1841
1842                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1843                        pi2_dst_scratch += out_stride;
1844
1845                    }
1846
1847                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
1848
1849                    /* o1[0-3] */
1850                    {
1851
1852                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1853
1854                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1855                        pi2_src_scratch += in_stride;
1856
1857                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1858                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1859
1860                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1861                        m_count = _mm_cvtsi32_si128(i4_shift);
1862                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1863                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1864
1865                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1866                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1867                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1868                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1869
1870                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1871
1872                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1873                        pi2_dst_scratch += out_stride;
1874
1875                    }
1876
1877                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
1878
1879                    /* o2[0-3] */
1880                    {
1881
1882                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1883
1884                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1885                        pi2_src_scratch += in_stride;
1886
1887                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1888                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1889
1890                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1891                        m_count = _mm_cvtsi32_si128(i4_shift);
1892                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1893                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1894
1895                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1896                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1897                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1898                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1899
1900                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1901
1902                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1903                        pi2_dst_scratch += out_stride;
1904
1905                    }
1906
1907                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
1908
1909                    /* o3[0-3] */
1910                    {
1911                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1912
1913                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1914                        pi2_src_scratch += in_stride;
1915
1916                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1917                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1918
1919                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1920                        m_count = _mm_cvtsi32_si128(i4_shift);
1921                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1922                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1923
1924                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1925                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1926                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1927                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1928
1929                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1930
1931                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1932                        pi2_dst_scratch += out_stride;
1933
1934                    }
1935
1936                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
1937
1938                    /* o4[0-3] */
1939                    {
1940                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1941
1942                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1943                        pi2_src_scratch += in_stride;
1944
1945                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1946                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1947
1948                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1949                        m_count = _mm_cvtsi32_si128(i4_shift);
1950                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1951                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1952
1953                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1954                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1955                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1956                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1957
1958                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1959
1960                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1961                        pi2_dst_scratch += out_stride;
1962
1963                    }
1964
1965                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
1966
1967                    /* o5[0-3] */
1968                    {
1969
1970                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1971
1972                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1973                        pi2_src_scratch += in_stride;
1974
1975                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1976                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1977
1978                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1979                        m_count = _mm_cvtsi32_si128(i4_shift);
1980                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1981                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1982
1983                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1984                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1985                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1986                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1987
1988                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1989
1990                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1991                        pi2_dst_scratch += out_stride;
1992
1993                    }
1994
1995                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
1996
1997                    /* o6[0-3] */
1998                    {
1999                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2000
2001                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2002                        pi2_src_scratch += in_stride;
2003
2004                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2005                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2006
2007                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2008                        m_count = _mm_cvtsi32_si128(i4_shift);
2009                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2010                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2011
2012                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2013                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2014                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2015                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2016
2017                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2018
2019                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2020                        pi2_dst_scratch += out_stride;
2021
2022                    }
2023
2024                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2025
2026                    /* o7[0-3] */
2027                    {
2028
2029                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2030
2031                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2032                        pi2_src_scratch += 8;
2033
2034                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2035                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2036
2037                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2038                        m_count = _mm_cvtsi32_si128(i4_shift);
2039                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2040                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2041
2042                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2043                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2044                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2045                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2046
2047                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2048
2049                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2050                        pi2_dst_scratch += 8;
2051
2052                    }
2053
2054                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2055
2056                    /* o8[0-3] */
2057                    {
2058                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2059
2060                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2061                        pi2_src_scratch -= in_stride;
2062
2063                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2064                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2065
2066                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2067                        m_count = _mm_cvtsi32_si128(i4_shift);
2068                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2069                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2070
2071                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2072                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2073                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2074                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2075
2076                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2077
2078                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2079                        pi2_dst_scratch -= out_stride;
2080                    }
2081
2082                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2083
2084                    /* o9[0-3] */
2085                    {
2086                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2087
2088                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2089                        pi2_src_scratch -= in_stride;
2090
2091                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2092                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2093
2094                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2095                        m_count = _mm_cvtsi32_si128(i4_shift);
2096                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2097                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2098
2099                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2100                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2101                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2102                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2103
2104                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2105
2106                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2107                        pi2_dst_scratch -= out_stride;
2108                    }
2109
2110                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2111
2112                    /* o10[0-3] */
2113                    {
2114                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2115
2116                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2117                        pi2_src_scratch -= in_stride;
2118
2119                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2120                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2121
2122                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2123                        m_count = _mm_cvtsi32_si128(i4_shift);
2124                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2125                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2126
2127                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2128                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2129                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2130                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2131
2132                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2133
2134                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2135                        pi2_dst_scratch -= out_stride;
2136                    }
2137
2138                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2139
2140                    /* o11[0-3] */
2141                    {
2142                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2143
2144                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2145                        pi2_src_scratch -= in_stride;
2146
2147                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2148                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2149
2150                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2151                        m_count = _mm_cvtsi32_si128(i4_shift);
2152                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2153                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2154
2155                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2156                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2157                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2158                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2159
2160                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2161
2162                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2163                        pi2_dst_scratch -= out_stride;
2164
2165                    }
2166
2167                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2168
2169                    /* o12[0-3] */
2170                    {
2171                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2172
2173                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2174                        pi2_src_scratch -= in_stride;
2175
2176                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2177                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2178
2179                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2180                        m_count = _mm_cvtsi32_si128(i4_shift);
2181                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2182                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2183
2184                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2185                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2186                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2187                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2188
2189                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2190
2191                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2192                        pi2_dst_scratch -= out_stride;
2193
2194                    }
2195
2196                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2197
2198                    /* o13[0-3] */
2199                    {
2200                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2201
2202                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2203                        pi2_src_scratch -= in_stride;
2204
2205                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2206                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2207
2208                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2209                        m_count = _mm_cvtsi32_si128(i4_shift);
2210                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2211                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2212
2213                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2214                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2215                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2216                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2217
2218                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2219
2220                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2221                        pi2_dst_scratch -= out_stride;
2222                    }
2223
2224                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2225
2226                    /* o14[0-3] */
2227                    {
2228                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2229
2230                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2231                        pi2_src_scratch -= in_stride;
2232
2233                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2234                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2235
2236                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2237                        m_count = _mm_cvtsi32_si128(i4_shift);
2238                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2239                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2240
2241                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2242                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2243                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2244                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2245
2246                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2247
2248                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2249                        pi2_dst_scratch -= out_stride;
2250
2251                    }
2252
2253                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2254
2255                    /* o15[0-3] */
2256                    {
2257                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2258
2259                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2260                        pi2_src_scratch += 8;
2261
2262                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2263                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2264
2265                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2266                        m_count = _mm_cvtsi32_si128(i4_shift);
2267                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2268                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2269
2270                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2271                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2272                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2273                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2274
2275                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2276
2277                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2278                        pi2_dst_scratch += 8;
2279                    }
2280
2281                }
2282            }
2283        }
2284        else if(zero_last24_rows_stg1)
2285        {
2286            /* o & stage 1 out */
2287            {
2288                WORD32 j;
2289
2290                WORD16 *pi2_src_scratch = o_temp_ptr;
2291                WORD16 *pi2_dst_scratch = temp_ptr;
2292                WORD32 out_stride = (trans_size << 1);
2293
2294                WORD32 in_stride = trans_size;
2295
2296                for(j = 0; j < 2; j++)
2297                {
2298                    if(j)
2299                    {
2300                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2301                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2302                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2303                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2304                    }
2305
2306                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2307                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2308
2309                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2310                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2311
2312                    /* o0[0-3] */
2313                    {
2314
2315                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2316                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2317
2318                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2319
2320                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2321                        pi2_src_scratch += in_stride;
2322
2323                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2324                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2325
2326                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2327                        m_count = _mm_cvtsi32_si128(i4_shift);
2328                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2329                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2330
2331                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2332                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2333                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2334                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2335
2336                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2337
2338                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2339                        pi2_dst_scratch += out_stride;
2340
2341                    }
2342
2343                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2344                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2345
2346                    /* o1[0-3] */
2347                    {
2348                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2349                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2350
2351                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2352
2353                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2354                        pi2_src_scratch += in_stride;
2355
2356                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2357                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2358
2359                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2360                        m_count = _mm_cvtsi32_si128(i4_shift);
2361                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2362                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2363
2364                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2365                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2366                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2367                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2368
2369                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2370
2371                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2372                        pi2_dst_scratch += out_stride;
2373
2374                    }
2375
2376                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2377                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2378
2379                    /* o2[0-3] */
2380                    {
2381                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2382                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2383
2384                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2385
2386                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2387                        pi2_src_scratch += in_stride;
2388
2389                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2390                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2391
2392                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2393                        m_count = _mm_cvtsi32_si128(i4_shift);
2394                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2395                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2396
2397                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2398                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2399                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2400                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2401
2402                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2403
2404                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2405                        pi2_dst_scratch += out_stride;
2406
2407                    }
2408
2409                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
2410                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
2411
2412                    /* o3[0-3] */
2413                    {
2414                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2415                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2416
2417                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2418
2419                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2420                        pi2_src_scratch += in_stride;
2421
2422                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2423                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2424
2425                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2426                        m_count = _mm_cvtsi32_si128(i4_shift);
2427                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2428                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2429
2430                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2431                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2432                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2433                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2434
2435                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2436
2437                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2438                        pi2_dst_scratch += out_stride;
2439
2440                    }
2441
2442                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
2443                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
2444
2445                    /* o4[0-3] */
2446                    {
2447                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2448                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2449
2450                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2451
2452                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2453                        pi2_src_scratch += in_stride;
2454
2455                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2456                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2457
2458                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2459                        m_count = _mm_cvtsi32_si128(i4_shift);
2460                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2461                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2462
2463                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2464                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2465                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2466                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2467
2468                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2469
2470                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2471                        pi2_dst_scratch += out_stride;
2472
2473                    }
2474
2475                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
2476                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
2477
2478                    /* o5[0-3] */
2479                    {
2480                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2481                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2482
2483                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2484
2485                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2486                        pi2_src_scratch += in_stride;
2487
2488                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2489                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2490
2491                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2492                        m_count = _mm_cvtsi32_si128(i4_shift);
2493                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2494                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2495
2496                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2497                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2498                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2499                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2500
2501                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2502
2503                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2504                        pi2_dst_scratch += out_stride;
2505
2506                    }
2507
2508                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2509                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
2510
2511                    /* o6[0-3] */
2512                    {
2513                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2514                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2515
2516                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2517
2518                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2519                        pi2_src_scratch += in_stride;
2520
2521                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2522                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2523
2524                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2525                        m_count = _mm_cvtsi32_si128(i4_shift);
2526                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2527                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2528
2529                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2530                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2531                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2532                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2533
2534                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2535
2536                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2537                        pi2_dst_scratch += out_stride;
2538
2539                    }
2540
2541                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2542                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
2543
2544                    /* o7[0-3] */
2545                    {
2546                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2547                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2548
2549                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2550
2551                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2552                        pi2_src_scratch += 8;
2553
2554                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2555                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2556
2557                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2558                        m_count = _mm_cvtsi32_si128(i4_shift);
2559                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2560                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2561
2562                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2563                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2564                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2565                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2566
2567                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2568
2569                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2570                        pi2_dst_scratch += 8;
2571
2572                    }
2573
2574                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2575                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
2576
2577                    /* o8[0-3] */
2578                    {
2579                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2580                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2581
2582                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2583
2584                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2585                        pi2_src_scratch -= in_stride;
2586
2587                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2588                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2589
2590                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2591                        m_count = _mm_cvtsi32_si128(i4_shift);
2592                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2593                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2594
2595                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2596                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2597                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2598                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2599
2600                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2601
2602                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2603                        pi2_dst_scratch -= out_stride;
2604                    }
2605
2606                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2607                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
2608
2609                    /* o9[0-3] */
2610                    {
2611                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2612                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2613
2614                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2615
2616                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2617                        pi2_src_scratch -= in_stride;
2618
2619                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2620                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2621
2622                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2623                        m_count = _mm_cvtsi32_si128(i4_shift);
2624                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2625                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2626
2627                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2628                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2629                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2630                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2631
2632                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2633
2634                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2635                        pi2_dst_scratch -= out_stride;
2636                    }
2637
2638                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2639                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
2640
2641                    /* o10[0-3] */
2642                    {
2643                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2644                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2645
2646                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2647
2648                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2649                        pi2_src_scratch -= in_stride;
2650
2651                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2652                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2653
2654                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2655                        m_count = _mm_cvtsi32_si128(i4_shift);
2656                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2657                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2658
2659                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2660                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2661                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2662                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2663
2664                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2665
2666                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2667                        pi2_dst_scratch -= out_stride;
2668                    }
2669
2670                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2671                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
2672
2673                    /* o11[0-3] */
2674                    {
2675
2676                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2677                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2678
2679                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2680
2681                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2682                        pi2_src_scratch -= in_stride;
2683
2684                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2685                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2686
2687                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2688                        m_count = _mm_cvtsi32_si128(i4_shift);
2689                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2690                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2691
2692                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2693                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2694                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2695                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2696
2697                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2698
2699                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2700                        pi2_dst_scratch -= out_stride;
2701
2702                    }
2703
2704                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2705                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
2706
2707                    /* o12[0-3] */
2708                    {
2709                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2710                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2711
2712                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2713
2714                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2715                        pi2_src_scratch -= in_stride;
2716
2717                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2718                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2719
2720                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2721                        m_count = _mm_cvtsi32_si128(i4_shift);
2722                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2723                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2724
2725                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2726                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2727                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2728                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2729
2730                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2731
2732                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2733                        pi2_dst_scratch -= out_stride;
2734
2735                    }
2736
2737                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2738                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
2739
2740                    /* o13[0-3] */
2741                    {
2742                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2743                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2744
2745                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2746
2747                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2748                        pi2_src_scratch -= in_stride;
2749
2750                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2751                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2752
2753                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2754                        m_count = _mm_cvtsi32_si128(i4_shift);
2755                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2756                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2757
2758                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2759                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2760                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2761                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2762
2763                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2764
2765                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2766                        pi2_dst_scratch -= out_stride;
2767                    }
2768
2769                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2770                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
2771
2772                    /* o14[0-3] */
2773                    {
2774                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2775                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2776
2777                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2778
2779                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2780                        pi2_src_scratch -= in_stride;
2781
2782                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2783                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2784
2785                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2786                        m_count = _mm_cvtsi32_si128(i4_shift);
2787                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2788                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2789
2790                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2791                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2792                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2793                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2794
2795                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2796
2797                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2798                        pi2_dst_scratch -= out_stride;
2799
2800                    }
2801
2802                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2803                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
2804
2805                    /* o15[0-3] */
2806                    {
2807                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2808                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2809
2810                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2811
2812                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2813                        pi2_src_scratch += 8;
2814
2815                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2816                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2817
2818                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2819                        m_count = _mm_cvtsi32_si128(i4_shift);
2820                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2821                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2822
2823                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2824                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2825                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2826                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2827
2828                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2829
2830                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2831                        pi2_dst_scratch += 8;
2832                    }
2833
2834                }
2835            }
2836        }
2837        else
2838        {
2839            /* o & stage 1 out */
2840            {
2841                WORD32 j;
2842
2843                WORD16 *pi2_src_scratch = o_temp_ptr;
2844                WORD16 *pi2_dst_scratch = temp_ptr;
2845                WORD32 out_stride = (trans_size << 1);
2846
2847                WORD32 in_stride = trans_size;
2848
2849
2850                for(j = 0; j < 2; j++)
2851                {
2852                    if(j)
2853                    {
2854                        m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2855                        m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2856                        m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2857                        m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2858                        m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
2859                        m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
2860                        m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
2861                        m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
2862
2863                        m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
2864                        m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
2865                        m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
2866                        m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
2867                        m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
2868                        m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
2869                        m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
2870                        m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
2871                    }
2872
2873                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2874                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2875                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
2876                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
2877                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
2878                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
2879                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
2880                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
2881
2882                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2883                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2884                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
2885                    m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
2886                    temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
2887                    temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
2888                    temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
2889                    temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
2890
2891
2892                    /* o0[0-3] */
2893                    {
2894                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2895                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2896                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2897                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2898
2899                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2900                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2901
2902                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2903
2904                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2905                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2906                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2907                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2908
2909                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2910                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2911
2912                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2913
2914                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2915
2916                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2917                        pi2_src_scratch += in_stride;
2918
2919                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2920                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2921
2922                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2923                        m_count = _mm_cvtsi32_si128(i4_shift);
2924                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2925                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2926
2927                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2928                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2929                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2930                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2931
2932                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2933
2934                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2935                        pi2_dst_scratch += out_stride;
2936
2937                    }
2938
2939                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2940                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2941                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
2942                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
2943                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
2944                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
2945                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
2946                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
2947
2948
2949                    /* o1[0-3] */
2950                    {
2951                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2952                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2953                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2954                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2955
2956                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2957                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2958
2959                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
2960
2961                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2962                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2963                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2964                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2965
2966                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2967                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2968
2969                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2970
2971                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2972
2973                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2974                        pi2_src_scratch += in_stride;
2975
2976                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2977                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2978
2979                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2980                        m_count = _mm_cvtsi32_si128(i4_shift);
2981                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2982                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2983
2984                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2985                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2986                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2987                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2988
2989                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2990
2991                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2992                        pi2_dst_scratch += out_stride;
2993
2994                    }
2995
2996                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2997                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2998                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
2999                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
3000                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
3001                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
3002                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
3003                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
3004
3005                    /* o2[0-3] */
3006                    {
3007                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3008                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3009                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3010                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3011
3012                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3013                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3014
3015                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3016
3017                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3018                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3019                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3020                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3021
3022                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
3023                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3024
3025                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
3026
3027                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3028
3029                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3030                        pi2_src_scratch += in_stride;
3031
3032                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3033                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3034
3035                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3036                        m_count = _mm_cvtsi32_si128(i4_shift);
3037                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3038                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3039
3040                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3041                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3042                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3043                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3044
3045                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3046
3047                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3048                        pi2_dst_scratch += out_stride;
3049
3050                    }
3051
3052
3053                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
3054                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
3055                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
3056                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
3057                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
3058                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
3059                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
3060                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
3061
3062                    /* o3[0-3] */
3063                    {
3064                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3065                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3066                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3067                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3068
3069                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3070                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3071
3072                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3073
3074                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3075                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3076                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3077                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3078
3079                        m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
3080                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3081
3082                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3083
3084                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3085
3086                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3087                        pi2_src_scratch += in_stride;
3088
3089                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3090                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3091
3092                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3093                        m_count = _mm_cvtsi32_si128(i4_shift);
3094                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3095                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3096
3097                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3098                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3099                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3100                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3101
3102                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3103
3104                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3105                        pi2_dst_scratch += out_stride;
3106
3107                    }
3108
3109                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
3110                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
3111                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
3112                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
3113                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
3114                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
3115                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
3116                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
3117
3118                    /* o4[0-3] */
3119                    {
3120                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3121                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3122                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3123                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3124
3125                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3126                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3127
3128                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3129
3130                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3131                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3132                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3133                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3134
3135                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3136                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3137
3138                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3139
3140                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3141
3142                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3143                        pi2_src_scratch += in_stride;
3144
3145                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3146                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3147
3148                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3149                        m_count = _mm_cvtsi32_si128(i4_shift);
3150                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3151                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3152
3153                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3154                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3155                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3156                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3157
3158                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3159
3160                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3161                        pi2_dst_scratch += out_stride;
3162
3163                    }
3164
3165
3166                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
3167                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
3168                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
3169                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
3170                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
3171                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
3172                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
3173                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
3174
3175                    /* o5[0-3] */
3176                    {
3177                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3178                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3179                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3180                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3181
3182                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3183                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3184
3185                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3186
3187                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3188                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3189                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3190                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3191
3192                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3193                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3194
3195                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3196
3197                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3198
3199                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3200                        pi2_src_scratch += in_stride;
3201
3202                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3203                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3204
3205                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3206                        m_count = _mm_cvtsi32_si128(i4_shift);
3207                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3208                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3209
3210                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3211                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3212                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3213                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3214
3215                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3216
3217                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3218                        pi2_dst_scratch += out_stride;
3219
3220                    }
3221
3222                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
3223                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
3224                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
3225                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
3226                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
3227                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
3228                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
3229                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
3230
3231
3232                    /* o6[0-3] */
3233                    {
3234                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3235                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3236                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3237                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3238
3239                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3240                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3241
3242                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3243
3244                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3245                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3246                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3247                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3248
3249                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3250                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3251
3252                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3253
3254                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3255
3256                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3257                        pi2_src_scratch += in_stride;
3258
3259                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3260                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3261
3262                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3263                        m_count = _mm_cvtsi32_si128(i4_shift);
3264                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3265                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3266
3267                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3268                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3269                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3270                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3271
3272                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3273
3274                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3275                        pi2_dst_scratch += out_stride;
3276
3277                    }
3278
3279                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
3280                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
3281                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
3282                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
3283                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
3284                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
3285                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
3286                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
3287
3288                    /* o7[0-3] */
3289                    {
3290                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3291                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3292                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3293                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3294
3295                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3296                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3297
3298                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3299
3300                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3301                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3302                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3303                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3304
3305                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3306                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3307
3308                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3309
3310                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3311
3312                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3313                        pi2_src_scratch += 8;
3314
3315                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3316                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3317
3318                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3319                        m_count = _mm_cvtsi32_si128(i4_shift);
3320                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3321                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3322
3323                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3324                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3325                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3326                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3327
3328                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3329
3330                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3331                        pi2_dst_scratch += 8;
3332
3333                    }
3334
3335                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
3336                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
3337                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
3338                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
3339                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
3340                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
3341                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
3342                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
3343
3344
3345                    /* o8[0-3] */
3346                    {
3347
3348                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3349                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3350                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3351                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3352
3353                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3354                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3355
3356                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3357
3358                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3359                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3360                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3361                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3362
3363                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3364                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3365
3366                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3367
3368                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3369
3370                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3371                        pi2_src_scratch -= in_stride;
3372
3373                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3374                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3375
3376                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3377                        m_count = _mm_cvtsi32_si128(i4_shift);
3378                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3379                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3380
3381                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3382                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3383                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3384                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3385
3386                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3387
3388                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3389                        pi2_dst_scratch -= out_stride;
3390                    }
3391
3392                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
3393                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
3394                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
3395                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
3396                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
3397                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
3398                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
3399                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
3400
3401
3402                    /* o9[0-3] */
3403                    {
3404                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3405                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3406                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3407                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3408
3409                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3410                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3411
3412                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3413
3414                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3415                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3416                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3417                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3418
3419                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3420                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3421
3422                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3423
3424                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3425
3426                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3427                        pi2_src_scratch -= in_stride;
3428
3429                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3430                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3431
3432                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3433                        m_count = _mm_cvtsi32_si128(i4_shift);
3434                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3435                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3436
3437                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3438                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3439                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3440                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3441
3442                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3443
3444                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3445                        pi2_dst_scratch -= out_stride;
3446                    }
3447
3448                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
3449                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
3450                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
3451                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
3452                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
3453                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
3454                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
3455                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
3456
3457                    /* o10[0-3] */
3458                    {
3459                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3460                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3461                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3462                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3463
3464                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3465                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3466
3467                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3468
3469                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3470                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3471                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3472                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3473
3474                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3475                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3476
3477                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3478
3479                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3480
3481                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3482                        pi2_src_scratch -= in_stride;
3483
3484                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3485                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3486
3487                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3488                        m_count = _mm_cvtsi32_si128(i4_shift);
3489                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3490                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3491
3492                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3493                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3494                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3495                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3496
3497                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3498
3499                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3500                        pi2_dst_scratch -= out_stride;
3501                    }
3502
3503                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
3504                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
3505                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
3506                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
3507                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
3508                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
3509                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
3510                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
3511
3512                    /* o11[0-3] */
3513                    {
3514                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3515                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3516                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3517                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3518
3519                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3520                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3521
3522                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3523
3524                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3525                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3526                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3527                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3528
3529                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3530                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3531
3532                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3533
3534                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3535
3536                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3537                        pi2_src_scratch -= in_stride;
3538
3539                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3540                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3541
3542                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3543                        m_count = _mm_cvtsi32_si128(i4_shift);
3544                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3545                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3546
3547                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3548                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3549                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3550                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3551
3552                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3553
3554                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3555                        pi2_dst_scratch -= out_stride;
3556
3557                    }
3558
3559                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
3560                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
3561                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
3562                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
3563                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
3564                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
3565                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
3566                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
3567
3568
3569                    /* o12[0-3] */
3570                    {
3571                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3572                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3573                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3574                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3575
3576                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3577                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3578
3579                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3580
3581                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3582                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3583                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3584                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3585
3586                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3587                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3588
3589                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3590
3591                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3592
3593                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3594                        pi2_src_scratch -= in_stride;
3595
3596                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3597                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3598
3599                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3600                        m_count = _mm_cvtsi32_si128(i4_shift);
3601                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3602                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3603
3604                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3605                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3606                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3607                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3608
3609                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3610
3611                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3612                        pi2_dst_scratch -= out_stride;
3613
3614                    }
3615
3616                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
3617                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
3618                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
3619                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
3620                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
3621                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
3622                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
3623                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
3624
3625
3626                    /* o13[0-3] */
3627                    {
3628                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3629                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3630                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3631                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3632
3633                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3634                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3635
3636                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3637
3638                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3639                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3640                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3641                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3642
3643                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3644                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3645
3646                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3647
3648                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3649
3650                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3651                        pi2_src_scratch -= in_stride;
3652
3653                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3654                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3655
3656                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3657                        m_count = _mm_cvtsi32_si128(i4_shift);
3658                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3659                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3660
3661                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3662                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3663                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3664                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3665
3666                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3667
3668                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3669                        pi2_dst_scratch -= out_stride;
3670                    }
3671
3672                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
3673                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
3674                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
3675                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
3676                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
3677                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
3678                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
3679                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
3680
3681
3682                    /* o14[0-3] */
3683                    {
3684                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3685                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3686                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3687                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3688
3689                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3690                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3691
3692                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3693
3694                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3695                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3696                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3697                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3698
3699                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3700                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3701
3702                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3703
3704                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3705
3706                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3707                        pi2_src_scratch -= in_stride;
3708
3709                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3710                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3711
3712                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3713                        m_count = _mm_cvtsi32_si128(i4_shift);
3714                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3715                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3716
3717                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3718                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3719                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3720                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3721
3722                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3723
3724                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3725                        pi2_dst_scratch -= out_stride;
3726
3727                    }
3728
3729                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
3730                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
3731                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
3732                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
3733                    m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
3734                    m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
3735                    m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
3736                    m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
3737
3738                    /* o15[0-3] */
3739                    {
3740                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3741                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3742                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3743                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3744
3745                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3746                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3747
3748                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3749
3750                        m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3751                        m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3752                        m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3753                        m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3754
3755                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3756                        m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3757
3758                        m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3759
3760                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3761
3762                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3763                        pi2_src_scratch += 8;
3764
3765                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3766                        m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3767
3768                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3769                        m_count = _mm_cvtsi32_si128(i4_shift);
3770                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3771                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3772
3773                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3774                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3775                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3776                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3777
3778                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3779
3780                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3781                        pi2_dst_scratch += 8;
3782                    }
3783
3784                }
3785            }
3786        }
3787        /* Transpose */
3788        {
3789            WORD16 *pi2_src_scratch = temp_ptr;
3790            WORD16 *pi2_dst_scratch = pi2_tmp;
3791            WORD32 in_stride = (trans_size << 1);
3792
3793            for(j = 0; j < 2; j++)
3794            {
3795                m_temp_reg_30 =  _mm_load_si128((__m128i *)pi2_src_scratch);
3796                pi2_src_scratch += in_stride;
3797                m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
3798                pi2_src_scratch += in_stride;
3799                m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
3800                pi2_src_scratch += in_stride;
3801                m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
3802                pi2_src_scratch += in_stride;
3803                m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
3804                pi2_src_scratch += in_stride;
3805                m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
3806                pi2_src_scratch += in_stride;
3807                m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
3808                pi2_src_scratch += in_stride;
3809                m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
3810                pi2_src_scratch += 8;
3811
3812                m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
3813                pi2_src_scratch -= in_stride;
3814                m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
3815                pi2_src_scratch -= in_stride;
3816                m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
3817                pi2_src_scratch -= in_stride;
3818                m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
3819                pi2_src_scratch -= in_stride;
3820                m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
3821                pi2_src_scratch -= in_stride;
3822                m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
3823                pi2_src_scratch -= in_stride;
3824                m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
3825                pi2_src_scratch -= in_stride;
3826                m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
3827                pi2_src_scratch += 8;
3828
3829
3830                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
3831                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
3832
3833                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
3834                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
3835
3836                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
3837                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
3838
3839                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
3840                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
3841
3842                m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
3843                m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
3844
3845                m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
3846                m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
3847
3848                m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
3849                m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
3850
3851                m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
3852                m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
3853
3854                /****************/
3855
3856                m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
3857                m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
3858
3859                m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
3860                m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
3861
3862                m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
3863                m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
3864
3865                m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
3866                m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
3867
3868                m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
3869                m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
3870
3871                m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
3872                m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
3873
3874                m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
3875                m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
3876
3877                m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
3878                m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
3879
3880                /******************/
3881
3882                m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
3883                m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
3884
3885                m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
3886                m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
3887
3888                m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
3889                m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
3890
3891                m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
3892                m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
3893
3894                m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
3895                m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
3896
3897                m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
3898                m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
3899
3900                m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
3901                m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
3902
3903                m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
3904                m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
3905
3906                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
3907                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
3908                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
3909                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
3910
3911                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
3912                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
3913                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
3914                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
3915
3916                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
3917                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
3918                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
3919                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
3920
3921                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
3922                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
3923                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
3924                _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
3925
3926                pi2_dst_scratch += 4 * trans_size;
3927            }
3928        }
3929        pi2_src += 8;
3930//      pi2_dequant_coeff +=8;
3931        pi2_tmp += 8 * trans_size;
3932        zero_cols = zero_cols >> 1;
3933    }
3934
3935    if(trans_size_stg1 != TRANS_SIZE_32)
3936    {
3937        m_temp_reg_10 = _mm_setzero_si128();
3938
3939        for(i = trans_size_stg1; i < 32; i += 8)
3940        {
3941            WORD16 *pi2_dst_scratch = pi2_tmp;
3942
3943            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
3944            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
3945            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
3946            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
3947
3948            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
3949            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
3950            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
3951            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
3952
3953            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
3954            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
3955            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
3956            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
3957
3958            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
3959            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
3960            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
3961            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
3962
3963            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
3964            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
3965            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
3966            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
3967
3968            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
3969            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
3970            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
3971            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
3972
3973            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
3974            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
3975            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
3976            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
3977
3978            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
3979            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
3980            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
3981            _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
3982
3983            pi2_tmp += 8 * trans_size;
3984        }
3985    }
3986
3987    pi2_tmp = pi2_tmp_orig;
3988
3989    /* Inverse Transform 2nd stage */
3990
3991
3992    for(j = 0; j < trans_size; j += 4)
3993    {
3994        i4_shift = IT_SHIFT_STAGE_2;
3995
3996        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
3997        if(zero_last28_rows_stg2)
3998        {
3999            {
4000
4001                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4002                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
4003                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
4004                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
4005                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
4006                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
4007                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
4008                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
4009
4010                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4011
4012                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
4013
4014                /* eo0[0-3] */
4015                {
4016                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4017
4018                }
4019                /* eo1[0-3] */
4020                {
4021                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4022
4023                }
4024                /* eo2[0-3] */
4025                {
4026                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4027                }
4028
4029                /* eo3[0-3] */
4030                {
4031                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4032                }
4033                /* eo4[0-3] */
4034                {
4035                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4036                }
4037
4038                /* eo5[0-3] */
4039                {
4040                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
4041                }
4042
4043                /* eo6[0-3] */
4044                {
4045                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
4046                }
4047                /* eo7[0-3] */
4048                {
4049                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
4050                }
4051            }
4052
4053            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4054
4055            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4056
4057            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4058
4059            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4060
4061            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4062
4063            /* e[]*/
4064
4065            temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
4066            temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
4067
4068            temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
4069            temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
4070
4071            temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
4072            temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
4073
4074            temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
4075            temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
4076
4077            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
4078            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
4079
4080            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
4081            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
4082
4083            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
4084            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
4085
4086            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
4087            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
4088
4089            /*o[k]*/
4090            {
4091
4092                WORD16 *pi2_dst_scratch = temp_ptr;
4093                WORD32 out_stride = 8;
4094
4095                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4096
4097                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4098                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4099
4100                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
4101
4102
4103                /* o0[0-3] */
4104                {
4105                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4106
4107                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4108                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4109
4110                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4111                    m_count = _mm_cvtsi32_si128(i4_shift);
4112                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4113                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4114
4115                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4116                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4117                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4118                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4119
4120                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4121
4122                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4123                    pi2_dst_scratch += out_stride;
4124
4125                }
4126
4127                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4128
4129                /* o1[0-3] */
4130                {
4131                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4132
4133                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4134                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4135
4136                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4137                    m_count = _mm_cvtsi32_si128(i4_shift);
4138                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4139                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4140
4141                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4142                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4143                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4144                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4145
4146                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4147
4148                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4149                    pi2_dst_scratch += out_stride;
4150
4151                }
4152
4153                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4154
4155                /* o2[0-3] */
4156                {
4157                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4158
4159                    m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
4160                    m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
4161
4162                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4163                    m_count = _mm_cvtsi32_si128(i4_shift);
4164                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4165                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4166
4167                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4168                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4169                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4170                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4171
4172                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4173
4174                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4175                    pi2_dst_scratch += out_stride;
4176
4177                }
4178
4179                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4180
4181                /* o3[0-3] */
4182                {
4183                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4184
4185                    m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
4186                    m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
4187
4188                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4189                    m_count = _mm_cvtsi32_si128(i4_shift);
4190                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4191                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4192
4193                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4194                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4195                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4196                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4197
4198                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4199
4200                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4201                    pi2_dst_scratch += out_stride;
4202
4203                }
4204
4205                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4206
4207                /* o4[0-3] */
4208                {
4209                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4210
4211                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4212                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4213
4214                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4215                    m_count = _mm_cvtsi32_si128(i4_shift);
4216                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4217                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4218
4219                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4220                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4221                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4222                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4223
4224                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4225
4226                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4227                    pi2_dst_scratch += out_stride;
4228
4229                }
4230
4231                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4232
4233                /* o5[0-3] */
4234                {
4235                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4236
4237                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4238                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4239
4240                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4241                    m_count = _mm_cvtsi32_si128(i4_shift);
4242                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4243                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4244
4245                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4246                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4247                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4248                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4249
4250                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4251
4252                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4253                    pi2_dst_scratch += out_stride;
4254
4255                }
4256
4257                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4258
4259                /* o6[0-3] */
4260                {
4261                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4262
4263                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4264                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4265
4266                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4267                    m_count = _mm_cvtsi32_si128(i4_shift);
4268                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4269                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4270
4271                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4272                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4273                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4274                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4275
4276                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4277
4278                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4279                    pi2_dst_scratch += out_stride;
4280
4281                }
4282
4283                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4284
4285                /* o7[0-3] */
4286                {
4287                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4288
4289                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4290                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4291
4292                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4293                    m_count = _mm_cvtsi32_si128(i4_shift);
4294                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4295                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4296
4297                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4298                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4299                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4300                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4301
4302                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4303
4304                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4305                    pi2_dst_scratch += 8;
4306
4307                }
4308
4309                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4310
4311                /* o8[0-3] */
4312                {
4313                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4314
4315                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4316                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4317
4318                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4319                    m_count = _mm_cvtsi32_si128(i4_shift);
4320                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4321                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4322
4323                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4324                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4325                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4326                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4327
4328                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4329
4330                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4331                    pi2_dst_scratch += out_stride;
4332                }
4333
4334                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4335
4336                /* o9[0-3] */
4337                {
4338                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4339
4340                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4341                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4342
4343                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4344                    m_count = _mm_cvtsi32_si128(i4_shift);
4345                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4346                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4347
4348                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4349                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4350                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4351                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4352
4353                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4354
4355                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4356                    pi2_dst_scratch += out_stride;
4357
4358                }
4359
4360                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4361
4362                /* o10[0-3] */
4363                {
4364                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4365
4366                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
4367                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
4368
4369                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4370                    m_count = _mm_cvtsi32_si128(i4_shift);
4371                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4372                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4373
4374                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4375                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4376                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4377                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4378
4379                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4380
4381                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4382                    pi2_dst_scratch += out_stride;
4383                }
4384
4385                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
4386
4387                /* o11[0-3] */
4388                {
4389                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4390
4391                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
4392                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
4393
4394                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4395                    m_count = _mm_cvtsi32_si128(i4_shift);
4396                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4397                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4398
4399                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4400                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4401                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4402                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4403
4404                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4405
4406                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4407                    pi2_dst_scratch += out_stride;
4408
4409                }
4410
4411                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
4412
4413                /* o12[0-3] */
4414                {
4415                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4416
4417                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
4418                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
4419
4420                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4421                    m_count = _mm_cvtsi32_si128(i4_shift);
4422                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4423                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4424
4425                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4426                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4427                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4428                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4429
4430                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4431
4432                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4433                    pi2_dst_scratch += out_stride;
4434
4435                }
4436
4437                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
4438
4439                /* o13[0-3] */
4440                {
4441                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4442
4443                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
4444                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
4445
4446                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4447                    m_count = _mm_cvtsi32_si128(i4_shift);
4448                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4449                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4450
4451                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4452                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4453                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4454                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4455
4456                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4457
4458                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4459                    pi2_dst_scratch += out_stride;
4460                }
4461
4462                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
4463
4464                /* o14[0-3] */
4465                {
4466                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4467
4468                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
4469                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
4470
4471                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4472                    m_count = _mm_cvtsi32_si128(i4_shift);
4473                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4474                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4475
4476                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4477                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4478                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4479                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4480
4481                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4482
4483                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4484                    pi2_dst_scratch += out_stride;
4485
4486                }
4487
4488                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
4489
4490                /* o15[0-3] */
4491                {
4492                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4493
4494                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
4495                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
4496
4497                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4498                    m_count = _mm_cvtsi32_si128(i4_shift);
4499                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4500                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4501
4502                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4503                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4504                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4505                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4506
4507                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4508
4509                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4510                    pi2_dst_scratch += 8;
4511                }
4512
4513            }
4514
4515        }
4516        else if(zero_last24_rows_stg2)
4517        {
4518            /* eo */
4519            {
4520                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4521
4522                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4523                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
4524
4525                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
4526
4527
4528                /* eo0[0-3] */
4529                {
4530                    m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4531
4532                }
4533
4534                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
4535
4536                /* eo1[0-3] */
4537                {
4538                    m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4539
4540                }
4541                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
4542
4543                /* eo2[0-3] */
4544                {
4545                    m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4546
4547                }
4548
4549                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
4550
4551                /* eo3[0-3] */
4552                {
4553
4554                    m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4555
4556                }
4557
4558                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
4559
4560                /* eo4[0-3] */
4561                {
4562                    m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4563
4564                }
4565
4566                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
4567
4568                /* eo5[0-3] */
4569                {
4570                    m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4571                }
4572
4573                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
4574                /* eo6[0-3] */
4575                {
4576                    m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4577                }
4578
4579                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
4580                /* eo7[0-3] */
4581                {
4582                    m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4583
4584                }
4585
4586            }
4587
4588            /* eeo */
4589            {
4590
4591                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
4592                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
4593                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
4594                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
4595
4596                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
4597
4598                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
4599
4600                /* eeo0[0-3] */
4601                {
4602                    temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4603
4604                }
4605
4606                /* eeo1[0-3] */
4607                {
4608                    temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4609
4610                }
4611
4612                /* eeo2[0-3] */
4613                {
4614                    temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4615
4616                }
4617
4618
4619                /* eeo3[0-3] */
4620                {
4621                    temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4622
4623                }
4624
4625            }
4626
4627            m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
4628            m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
4629            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4630
4631            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4632
4633            //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
4634            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4635
4636            m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4637            m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4638
4639            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
4640            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
4641
4642            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
4643            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
4644
4645            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
4646            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
4647
4648            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
4649            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
4650
4651            /* e[]*/
4652
4653            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
4654            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
4655
4656            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
4657            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
4658
4659            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
4660            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
4661
4662            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
4663            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
4664
4665            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
4666            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
4667
4668            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
4669            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
4670
4671            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
4672            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
4673
4674            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
4675            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
4676
4677            /*o[k] */
4678            {
4679
4680                WORD16 *pi2_dst_scratch = temp_ptr;
4681                WORD32 out_stride = 8;
4682
4683                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4684                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
4685
4686                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4687                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4688                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
4689                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
4690
4691                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
4692                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
4693
4694                /* o0[0-3] */
4695                {
4696                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4697                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4698
4699                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4700
4701                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4702                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4703
4704                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4705                    m_count = _mm_cvtsi32_si128(i4_shift);
4706                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4707                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4708
4709                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4710                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4711                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4712                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4713
4714                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4715
4716                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4717                    pi2_dst_scratch += out_stride;
4718
4719                }
4720
4721
4722                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4723                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
4724
4725                /* o1[0-3] */
4726                {
4727                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4728                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4729
4730                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4731
4732                    m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4733                    m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4734
4735                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4736                    m_count = _mm_cvtsi32_si128(i4_shift);
4737                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4738                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4739
4740                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4741                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4742                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4743                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4744
4745                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4746
4747                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4748                    pi2_dst_scratch += out_stride;
4749
4750                }
4751
4752                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4753                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
4754
4755                /* o2[0-3] */
4756                {
4757                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4758                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4759
4760                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4761
4762                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
4763                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
4764
4765                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4766                    m_count = _mm_cvtsi32_si128(i4_shift);
4767                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4768                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4769
4770                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4771                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4772                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4773                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4774
4775                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4776
4777                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4778                    pi2_dst_scratch += out_stride;
4779
4780                }
4781
4782                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4783                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
4784
4785                /* o3[0-3] */
4786                {
4787                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4788                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4789
4790                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4791
4792                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
4793                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
4794
4795                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4796                    m_count = _mm_cvtsi32_si128(i4_shift);
4797                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4798                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4799
4800                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4801                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4802                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4803                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4804
4805                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4806
4807                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4808                    pi2_dst_scratch += out_stride;
4809
4810                }
4811
4812                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4813                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
4814
4815                /* o4[0-3] */
4816                {
4817                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4818                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4819
4820                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4821
4822                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4823                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4824
4825                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4826                    m_count = _mm_cvtsi32_si128(i4_shift);
4827                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4828                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4829
4830                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4831                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4832                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4833                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4834
4835                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4836
4837                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4838                    pi2_dst_scratch += out_stride;
4839
4840                }
4841
4842                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4843                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
4844
4845                /* o5[0-3] */
4846                {
4847                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4848                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4849
4850                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4851
4852                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4853                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4854
4855                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4856                    m_count = _mm_cvtsi32_si128(i4_shift);
4857                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4858                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4859
4860                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4861                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4862                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4863                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4864
4865                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4866
4867                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4868                    pi2_dst_scratch += out_stride;
4869
4870                }
4871
4872                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4873                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
4874
4875                /* o6[0-3] */
4876                {
4877                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4878                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4879
4880                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4881
4882                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4883                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4884
4885                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4886                    m_count = _mm_cvtsi32_si128(i4_shift);
4887                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4888                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4889
4890                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4891                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4892                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4893                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4894
4895                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4896
4897                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4898                    pi2_dst_scratch += out_stride;
4899
4900                }
4901
4902                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4903                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
4904
4905                /* o7[0-3] */
4906                {
4907                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4908                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4909
4910                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4911
4912                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4913                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4914
4915                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4916                    m_count = _mm_cvtsi32_si128(i4_shift);
4917                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4918                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4919
4920                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4921                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4922                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4923                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4924
4925                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4926
4927                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4928                    pi2_dst_scratch += 8;
4929
4930                }
4931
4932                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4933                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
4934
4935                /* o8[0-3] */
4936                {
4937                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4938                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4939
4940                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4941
4942                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4943                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4944
4945                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4946                    m_count = _mm_cvtsi32_si128(i4_shift);
4947                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4948                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4949
4950                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4951                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4952                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4953                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4954
4955                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4956
4957                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4958                    pi2_dst_scratch += out_stride;
4959                }
4960
4961                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4962                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
4963
4964                /* o9[0-3] */
4965                {
4966                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4967                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4968
4969                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4970
4971                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4972                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4973
4974                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4975                    m_count = _mm_cvtsi32_si128(i4_shift);
4976                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4977                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4978
4979                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4980                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4981                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4982                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4983
4984                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4985
4986                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4987                    pi2_dst_scratch += out_stride;
4988                }
4989
4990                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4991                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
4992
4993                /* o10[0-3] */
4994                {
4995                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4996                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4997
4998                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4999
5000                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
5001                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
5002
5003                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5004                    m_count = _mm_cvtsi32_si128(i4_shift);
5005                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5006                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5007
5008                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5009                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5010                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5011                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5012
5013                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5014
5015                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5016                    pi2_dst_scratch += out_stride;
5017                }
5018
5019                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
5020                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
5021
5022                /* o11[0-3] */
5023                {
5024                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5025                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5026
5027                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5028
5029                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
5030                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
5031
5032                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5033                    m_count = _mm_cvtsi32_si128(i4_shift);
5034                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5035                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5036
5037                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5038                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5039                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5040                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5041
5042                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5043
5044                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5045                    pi2_dst_scratch += out_stride;
5046
5047                }
5048
5049                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
5050                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
5051
5052                /* o12[0-3] */
5053                {
5054                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5055                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5056
5057                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5058
5059                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
5060                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
5061
5062                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5063                    m_count = _mm_cvtsi32_si128(i4_shift);
5064                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5065                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5066
5067                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5068                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5069                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5070                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5071
5072                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5073
5074                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5075                    pi2_dst_scratch += out_stride;
5076
5077                }
5078
5079                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
5080                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
5081
5082                /* o13[0-3] */
5083                {
5084                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5085                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5086
5087                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5088
5089                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
5090                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
5091
5092                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5093                    m_count = _mm_cvtsi32_si128(i4_shift);
5094                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5095                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5096
5097                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5098                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5099                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5100                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5101
5102                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5103
5104                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5105                    pi2_dst_scratch += out_stride;
5106                }
5107
5108                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
5109                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
5110
5111                /* o14[0-3] */
5112                {
5113                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5114                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5115
5116                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5117
5118                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
5119                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
5120
5121                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5122                    m_count = _mm_cvtsi32_si128(i4_shift);
5123                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5124                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5125
5126                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5127                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5128                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5129                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5130
5131                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5132
5133                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5134                    pi2_dst_scratch += out_stride;
5135                }
5136
5137                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
5138                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
5139
5140                /* o15[0-3] */
5141                {
5142                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5143                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5144
5145                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5146
5147                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
5148                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
5149
5150                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5151                    m_count = _mm_cvtsi32_si128(i4_shift);
5152                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5153                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5154
5155                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5156                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5157                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5158                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5159
5160                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5161
5162                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5163                    pi2_dst_scratch += 8;
5164                }
5165
5166            }
5167        }
5168        else
5169        {
5170            /* eo */
5171            {
5172
5173                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
5174                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
5175                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
5176                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
5177
5178
5179                m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
5180                m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
5181                m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
5182                m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
5183                m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
5184                m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
5185                m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
5186                m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
5187
5188                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
5189                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
5190                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
5191                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
5192
5193                /* eo0[0-3] */
5194                {
5195                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5196                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5197
5198                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5199
5200                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5201                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5202
5203                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5204
5205                    m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5206
5207                }
5208
5209                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
5210                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
5211                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
5212                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
5213
5214                /* eo1[0-3] */
5215                {
5216                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5217                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5218
5219                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5220
5221                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5222                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5223
5224                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5225
5226                    m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
5227
5228                }
5229
5230                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
5231                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
5232                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
5233                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
5234
5235                /* eo2[0-3] */
5236                {
5237                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5238                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5239
5240                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5241
5242                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5243                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5244
5245                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5246
5247                    m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5248
5249                }
5250
5251                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
5252                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
5253                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
5254                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
5255
5256                /* eo3[0-3] */
5257                {
5258                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5259                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5260
5261                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5262
5263                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5264                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5265
5266                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
5267
5268                    m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5269
5270                }
5271
5272                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
5273                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
5274                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
5275                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
5276
5277
5278                /* eo4[0-3] */
5279                {
5280                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5281                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5282
5283                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5284
5285                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5286                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5287
5288                    m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
5289
5290                    m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5291
5292                }
5293
5294                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
5295                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
5296                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
5297                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
5298
5299                /* eo5[0-3] */
5300                {
5301                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5302                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5303
5304                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5305
5306                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5307                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5308
5309                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5310
5311                    m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5312                }
5313
5314                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
5315                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
5316                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
5317                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
5318
5319                /* eo6[0-3] */
5320                {
5321                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5322                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5323
5324                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5325
5326                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5327                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5328
5329                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5330
5331                    m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5332
5333                }
5334
5335                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
5336                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
5337                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
5338                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
5339
5340                /* eo7[0-3] */
5341                {
5342                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5343                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5344
5345                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5346
5347                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5348                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5349
5350                    m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5351
5352                    m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5353
5354
5355                }
5356
5357            }
5358
5359            /* eeo */
5360            {
5361                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
5362                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
5363
5364                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
5365                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
5366                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
5367                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
5368
5369                /* eeo0[0-3] */
5370                {
5371
5372                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
5373                    m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
5374
5375                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5376                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5377
5378                    temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5379
5380                }
5381
5382                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
5383                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
5384
5385                /* eeo1[0-3] */
5386                {
5387                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5388                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5389
5390                    temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5391
5392                }
5393
5394                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
5395                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
5396
5397                /* eeo2[0-3] */
5398                {
5399                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5400                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5401
5402                    temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5403
5404                }
5405
5406                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
5407                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
5408
5409                /* eeo3[0-3] */
5410                {
5411                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5412                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5413
5414                    temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5415
5416                }
5417
5418
5419            }
5420
5421            m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
5422            m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
5423
5424            m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
5425            m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
5426
5427            m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
5428            m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
5429
5430            m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
5431
5432            m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
5433            m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
5434
5435            m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
5436
5437            m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
5438            m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
5439
5440            m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
5441            m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
5442
5443/* eeeo[0]= m_temp_reg_20  */
5444/* eeeo[1]= m_temp_reg_22  */
5445/* eeee[0]= m_temp_reg_21  */
5446/* eeee[1]= m_temp_reg_23  */
5447
5448            /* eee[0] = eeee[0] + eeeo[0]; */
5449            m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
5450
5451            /* eee[3] = eeee[0] - eeeo[0]; */
5452            m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
5453
5454            /* eee[2] = eeee[1] - eeeo[1]; */
5455            m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
5456
5457            /* eee[1] = eeee[1] + eeeo[1];*/
5458            m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
5459
5460            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
5461            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
5462
5463            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
5464            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
5465
5466            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
5467            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
5468
5469            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
5470            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
5471
5472/* e[]*/
5473
5474            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
5475            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
5476
5477            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
5478            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
5479
5480            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
5481            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
5482
5483            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
5484            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
5485
5486            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
5487            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
5488
5489            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
5490            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
5491
5492            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
5493            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
5494
5495            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
5496            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
5497
5498/*o[k] */
5499            {
5500
5501                WORD16 *pi2_dst_scratch = temp_ptr;
5502                WORD32 out_stride = 8;
5503
5504                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
5505                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
5506                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
5507                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
5508                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
5509                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
5510                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
5511                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
5512
5513
5514                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
5515                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
5516                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
5517                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
5518                m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
5519                m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
5520                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
5521                m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
5522
5523                m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
5524                m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
5525                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
5526                m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
5527                m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
5528                m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
5529                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
5530                m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
5531
5532                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
5533                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
5534                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
5535                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
5536                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
5537                m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
5538                m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
5539                m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
5540
5541                /* o0[0-3]: eight-term multiply-accumulate with m_coeff1..m_coeff8 (set before this point, loads not visible here), combined with temp1, rounded, packed and stored */
5542                {
5543                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5544                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5545                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5546                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5547
5548                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5549                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5550
5551                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5552
5553                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5554                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5555                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5556                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5557
5558                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5559                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5560
5561                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5562
5563                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5564
5565                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); /* temp1 is presumably the matching even-part term computed earlier - TODO confirm */
5566                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); /* NOTE(review): here _30 holds the sum and _31 the difference; the o1 and later stages assign them the other way round - confirm intended */
5567
5568                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5569                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5570                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5571                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5572
5573                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5574                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5575                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5576                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5577
5578                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5579
5580                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5581                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5582
5583                }
5584
5585                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); /* unaligned loads of table rows 8..15 for the o1 stage */
5586                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
5587                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
5588                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
5589                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
5590                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
5591                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
5592                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
5593
5594                /* o1[0-3]: eight-term multiply-accumulate with table rows 8..15, combined with temp3, rounded, packed and stored */
5595                {
5596                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5597                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5598                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5599                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5600
5601                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5602                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5603
5604                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); /* (p2 + p3) - (p0 + p1). NOTE(review): products 0 and 1 enter negated here, unlike o0 and o4+; presumably matches the sign layout of the packed table - confirm */
5605
5606                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5607                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5608                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5609                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5610
5611                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5612                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5613
5614                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5615
5616                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5617
5618                    m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20); /* temp3 is presumably the matching even-part term computed earlier - TODO confirm */
5619                    m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
5620
5621                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5622                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5623                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5624                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5625
5626                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5627                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5628                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5629                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5630
5631                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5632
5633                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5634                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5635
5636                }
5637
5638                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); /* unaligned loads of table rows 16..23 for the o2 stage */
5639                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
5640                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
5641                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
5642                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
5643                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
5644                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
5645                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
5646
5647                /* o2[0-3]: eight-term multiply-accumulate with table rows 16..23, combined with temp5, rounded, packed and stored */
5648                {
5649                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5650                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5651                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5652                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5653
5654                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* p1 - p0 */
5655                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5656
5657                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* -p0 + p1 + p2 + p3 */
5658
5659                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5660                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5661                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5662                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5663
5664                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); /* p4 - p5 */
5665                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5666
5667                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); /* p4 - p5 - p6 - p7. NOTE(review): the mixed signs in this stage presumably match the sign layout of the packed table - confirm */
5668
5669                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight signed products accumulated */
5670
5671                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); /* temp5 is presumably the matching even-part term computed earlier - TODO confirm */
5672                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
5673
5674                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5675                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5676                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5677                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5678
5679                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5680                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5681                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5682                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5683
5684                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5685
5686                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5687                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5688
5689                }
5690
5691                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); /* unaligned loads of table rows 24..31 for the o3 stage */
5692                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
5693                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
5694                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
5695                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
5696                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
5697                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
5698                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
5699
5700                /* o3[0-3]: eight-term multiply-accumulate with table rows 24..31, combined with temp7, rounded, packed and stored */
5701                {
5702                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5703                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5704                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5705                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5706
5707                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* p1 - p0 */
5708                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5709
5710                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* -p0 + p1 + p2 + p3 */
5711
5712                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5713                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5714                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5715                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5716
5717                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); /* p5 - p4 */
5718                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5719
5720                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* -p4 + p5 + p6 + p7. NOTE(review): the mixed signs in this stage presumably match the sign layout of the packed table - confirm */
5721
5722                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight signed products accumulated */
5723
5724                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); /* temp7 is presumably the matching even-part term computed earlier - TODO confirm */
5725                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
5726
5727                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5728                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5729                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5730                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5731
5732                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5733                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5734                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5735                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5736
5737                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5738
5739                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5740                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5741
5742                }
5743
5744                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); /* unaligned loads of table rows 32..39 for the o4 stage */
5745                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
5746                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
5747                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
5748                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
5749                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
5750                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
5751                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
5752
5753                /* o4[0-3]: eight-term multiply-accumulate with table rows 32..39, combined with m_temp_reg_90, rounded, packed and stored */
5754                {
5755                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5756                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5757                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5758                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5759
5760                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5761                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5762
5763                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5764
5765                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5766                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5767                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5768                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5769
5770                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5771                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5772
5773                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5774
5775                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5776
5777                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); /* m_temp_reg_90 is presumably the matching even-part term computed earlier - TODO confirm */
5778                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
5779                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5780                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5781                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5782                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5783
5784                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5785                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5786                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5787                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5788
5789                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5790
5791                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5792                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5793
5794                }
5795
5796                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); /* unaligned loads of table rows 40..47 for the o5 stage */
5797                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
5798                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
5799                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
5800                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
5801                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
5802                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
5803                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
5804
5805                /* o5[0-3]: eight-term multiply-accumulate with table rows 40..47, combined with m_temp_reg_92, rounded, packed and stored */
5806                {
5807                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5808                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5809                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5810                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5811
5812                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5813                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5814
5815                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5816
5817                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5818                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5819                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5820                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5821
5822                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5823                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5824
5825                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5826
5827                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5828
5829                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); /* m_temp_reg_92 is presumably the matching even-part term computed earlier - TODO confirm */
5830                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
5831
5832                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5833                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5834                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5835                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5836
5837                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5838                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5839                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5840                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5841
5842                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5843
5844                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5845                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5846
5847                }
5848
5849                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); /* unaligned loads of table rows 48..55 for the o6 stage */
5850                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
5851                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
5852                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
5853                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
5854                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
5855                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
5856                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
5857
5858                /* o6[0-3]: eight-term multiply-accumulate with table rows 48..55, combined with m_temp_reg_94, rounded, packed and stored */
5859                {
5860                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5861                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5862                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5863                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5864
5865                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5866                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5867
5868                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5869
5870                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5871                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5872                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5873                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5874
5875                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5876                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5877
5878                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5879
5880                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5881
5882                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); /* m_temp_reg_94 is presumably the matching even-part term computed earlier - TODO confirm */
5883                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
5884
5885                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5886                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5887                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5888                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5889
5890                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5891                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5892                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5893                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5894
5895                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5896
5897                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5898                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
5899
5900                }
5901
5902                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); /* unaligned loads of table rows 56..63 for the o7 stage */
5903                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
5904                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
5905                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
5906                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
5907                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
5908                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
5909                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
5910
5911                /* o7[0-3]: eight-term multiply-accumulate with table rows 56..63, combined with m_temp_reg_96, rounded, packed and stored */
5912                {
5913                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5914                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5915                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5916                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5917
5918                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5919                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5920
5921                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5922
5923                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5924                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5925                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5926                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5927
5928                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5929                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5930
5931                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5932
5933                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5934
5935                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); /* m_temp_reg_96 is presumably the matching even-part term computed earlier - TODO confirm */
5936                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
5937
5938                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5939                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5940                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5941                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5942
5943                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5944                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5945                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5946                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5947
5948                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
5949
5950                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
5951                    pi2_dst_scratch += 8; /* NOTE(review): fixed step of 8 here, while the neighbouring stages step by out_stride - confirm this is the intended scratch layout */
5952
5953                }
5954
5955                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); /* unaligned loads of table rows 64..71 for the o8 stage */
5956                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
5957                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
5958                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
5959                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
5960                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
5961                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
5962                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
5963
5964                /* o8[0-3]: eight-term multiply-accumulate with table rows 64..71, combined with m_temp_reg_97, rounded, packed and stored */
5965                {
5966                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
5967                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5968                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5969                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5970
5971                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5972                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5973
5974                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
5975
5976                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5977                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5978                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5979                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5980
5981                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5982                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5983
5984                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
5985
5986                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
5987
5988                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); /* m_temp_reg_97 is presumably the matching even-part term computed earlier - TODO confirm */
5989                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
5990
5991                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
5992                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
5993                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
5994                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
5995
5996                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5997                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5998                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
5999                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6000
6001                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
6002
6003                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
6004                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
6005                }
6006
6007                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); /* unaligned loads of table rows 72..79 for the o9 stage */
6008                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
6009                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
6010                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
6011                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
6012                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
6013                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
6014                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
6015
6016                /* o9[0-3]: eight-term multiply-accumulate with table rows 72..79, combined with m_temp_reg_95, rounded, packed and stored */
6017                {
6018                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
6019                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6020                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6021                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6022
6023                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6024                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6025
6026                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
6027
6028                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6029                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6030                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6031                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6032
6033                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6034                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6035
6036                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
6037
6038                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
6039
6040                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); /* m_temp_reg_95 is presumably the matching even-part term computed earlier - TODO confirm */
6041                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
6042
6043                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
6044                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
6045                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
6046                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
6047
6048                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6049                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6050                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
6051                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6052
6053                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
6054
6055                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
6056                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
6057                }
6058
6059                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); /* unaligned loads of table rows 80..87 for the o10 stage */
6060                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
6061                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
6062                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
6063                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
6064                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
6065                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
6066                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
6067
6068                /* o10[0-3]: eight-term multiply-accumulate with table rows 80..87, combined with m_temp_reg_93, rounded, packed and stored */
6069                {
6070                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* 16-bit multiplies, adjacent products summed into four 32-bit lanes */
6071                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6072                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6073                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6074
6075                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6076                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6077
6078                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* sum of the first four products */
6079
6080                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6081                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6082                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6083                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6084
6085                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6086                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6087
6088                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* sum of the last four products */
6089
6090                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* all eight products accumulated */
6091
6092                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); /* m_temp_reg_93 is presumably the matching even-part term computed earlier - TODO confirm */
6093                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
6094
6095                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant in lane 0 */
6096                    m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
6097                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0-1 */
6098                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four 32-bit lanes */
6099
6100                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6101                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6102                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
6103                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6104
6105                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* signed-saturate to 16 bit: _30 lanes first, _31 lanes after */
6106
6107                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* unaligned 128-bit store of the eight packed results */
6108                    pi2_dst_scratch += out_stride; /* step the scratch pointer for the next stage */
6109                }
6110
6111
6112                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
6113                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
6114                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
6115                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
6116                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
6117                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
6118                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
6119                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
6120
6121                /* o11[0-3] */
6122                {
6123                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6124                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6125                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6126                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6127
6128                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6129                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6130
6131                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6132
6133                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6134                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6135                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6136                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6137
6138                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6139                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6140
6141                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6142
6143                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6144
6145                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
6146                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
6147
6148                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6149                    m_count = _mm_cvtsi32_si128(i4_shift);
6150                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6151                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6152
6153                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6154                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6155                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6156                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6157
6158                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6159
6160                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6161                    pi2_dst_scratch += out_stride;
6162
6163                }
6164
6165                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
6166                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
6167                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
6168                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
6169                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
6170                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
6171                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
6172                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
6173
6174                /* o12[0-3] */
6175                {
6176                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6177                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6178                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6179                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6180
6181                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6182                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6183
6184                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6185
6186                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6187                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6188                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6189                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6190
6191                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6192                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6193
6194                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6195
6196                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6197
6198                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
6199                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
6200
6201                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6202                    m_count = _mm_cvtsi32_si128(i4_shift);
6203                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6204                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6205
6206                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6207                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6208                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6209                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6210
6211                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6212
6213                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6214                    pi2_dst_scratch += out_stride;
6215
6216                }
6217
6218                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
6219                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
6220                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
6221                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
6222                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
6223                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
6224                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
6225                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
6226
6227                /* o13[0-3] */
6228                {
6229                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6230                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6231                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6232                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6233
6234                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6235                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6236
6237                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6238
6239                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6240                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6241                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6242                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6243
6244                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6245                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6246
6247                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6248
6249                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6250
6251                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
6252                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
6253
6254                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6255                    m_count = _mm_cvtsi32_si128(i4_shift);
6256                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6257                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6258
6259                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6260                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6261                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6262                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6263
6264                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6265
6266                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6267                    pi2_dst_scratch += out_stride;
6268                }
6269
6270                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
6271                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
6272                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
6273                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
6274                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
6275                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
6276                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
6277                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
6278
6279                /* o14[0-3] */
6280                {
6281                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6282                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6283                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6284                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6285
6286                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6287                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6288
6289                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6290
6291                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6292                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6293                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6294                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6295
6296                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6297                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6298
6299                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6300
6301                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6302
6303                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
6304                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
6305
6306                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6307                    m_count = _mm_cvtsi32_si128(i4_shift);
6308                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6309                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6310
6311                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6312                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6313                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6314                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6315
6316                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6317
6318                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6319                    pi2_dst_scratch += out_stride;
6320
6321                }
6322
6323                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
6324                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
6325                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
6326                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
6327                m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
6328                m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
6329                m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
6330                m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
6331
6332                /* o15[0-3] */
6333                {
6334                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6335                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6336                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6337                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6338
6339                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6340                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6341
6342                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6343
6344                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6345                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6346                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6347                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6348
6349                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6350                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6351
6352                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6353
6354                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6355
6356                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
6357                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
6358
6359                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6360                    m_count = _mm_cvtsi32_si128(i4_shift);
6361                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6362                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6363
6364                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6365                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6366                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6367                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6368
6369                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6370
6371                    _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6372                    pi2_dst_scratch += 8;
6373                }
6374
6375            }
6376        }
6377
        /* Transpose the scratch data, add prediction, clip to 8 bit and store 4 rows of 32 pixels */
6379        {
6380
6381            WORD16 *pi2_src_scratch = temp_ptr;
6382            WORD32 out_stride = dst_strd;
6383            WORD32 in_stride = 8;
6384
6385            m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
6386            pi2_src_scratch += in_stride;
6387            m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
6388            pi2_src_scratch += in_stride;
6389            m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
6390            pi2_src_scratch += in_stride;
6391            m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
6392            pi2_src_scratch += in_stride;
6393            m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
6394            pi2_src_scratch += in_stride;
6395            m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
6396            pi2_src_scratch += in_stride;
6397            m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
6398            pi2_src_scratch += in_stride;
6399            m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
6400            pi2_src_scratch += 8;
6401
6402            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
6403            pi2_src_scratch += in_stride;
6404            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
6405            pi2_src_scratch += in_stride;
6406            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
6407            pi2_src_scratch += in_stride;
6408            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
6409            pi2_src_scratch += in_stride;
6410            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
6411            pi2_src_scratch += in_stride;
6412            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
6413            pi2_src_scratch += in_stride;
6414            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
6415            pi2_src_scratch += in_stride;
6416            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
6417            pi2_src_scratch += 8;
6418
6419
6420            m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
6421            m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
6422
6423            m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
6424            m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
6425
6426            m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
6427            m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
6428
6429            m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
6430            m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
6431
6432            m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
6433            m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
6434
6435            m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
6436            m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
6437
6438            m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
6439            m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
6440
6441            m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
6442            m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
6443
6444
6445            m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
6446            m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
6447
6448            m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
6449            m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
6450
6451            m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
6452            m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
6453
6454            m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
6455            m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
6456
6457            m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
6458            m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
6459
6460            m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
6461            m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
6462
6463            m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
6464            m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
6465
6466            m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
6467            m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
6468
6469
6470            m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
6471            m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
6472
6473            m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
6474            m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
6475
6476            m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
6477            m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
6478
6479            m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
6480            m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
6481
6482            m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
6483            m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
6484
6485            m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
6486            m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
6487
6488            m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
6489            m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
6490
6491            m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
6492            m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
6493
6494            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6495
6496            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6497            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6498
6499            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
6500            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6501
6502            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6503            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6504
6505            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
6506            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6507
6508            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6509
6510            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6511
6512            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6513            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6514
6515            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
6516            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6517
6518            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6519            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6520
6521            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
6522            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6523
6524            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6525            pu1_dst += out_stride;
6526            pu1_pred += pred_strd;
6527
6528
6529            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6530
6531            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6532            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6533
6534            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
6535            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6536
6537            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6538            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6539
6540            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
6541            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6542
6543            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6544
6545            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6546
6547            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6548            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6549
6550            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
6551            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6552
6553            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6554            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6555
6556            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
6557            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6558
6559            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6560            pu1_dst += out_stride;
6561            pu1_pred += pred_strd;
6562
6563            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6564
6565            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6566            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6567
6568            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
6569            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6570
6571            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6572            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6573
6574            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
6575            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6576
6577            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6578
6579            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6580
6581            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6582            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6583
6584            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
6585            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6586
6587            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6588            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6589
6590            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
6591            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6592
6593            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6594            pu1_dst += out_stride;
6595            pu1_pred += pred_strd;
6596
6597
6598            m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6599
6600            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6601            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6602
6603            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
6604            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6605
6606            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6607            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6608
6609            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
6610            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6611
6612            _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6613
6614            m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6615
6616            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6617            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6618
6619            m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
6620            m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6621
6622            //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6623            m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6624
6625            m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
6626            m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6627
6628            _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6629            pu1_dst += out_stride;
6630            pu1_pred += pred_strd;
6631
6632        }
6633        pi2_tmp += 4;
6634    }
6635}
6636
6637