1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_16x16_itrans_recon_x86_intr.c
22 *
23 * @brief
24 *  Contains function definitions for inverse
25 * transform and reconstruction for 16x16.
26 *
27 * @author
28 *  100470
29 *  100592 (edited by)
30 *
31 * @par List of Functions:
32 *  - ihevc_itrans_recon_16x16_sse42()
33 *
34 * @remarks
35 *  None
36 *
37 *******************************************************************************
38 */
39#include <stdio.h>
40#include <string.h>
41#include "ihevc_typedefs.h"
42#include "ihevc_macros.h"
43#include "ihevc_platform_macros.h"
44#include "ihevc_defs.h"
45#include "ihevc_trans_tables.h"
46#include "ihevc_itrans_recon.h"
47#include "ihevc_func_selector.h"
48#include "ihevc_trans_macros.h"
49
50#include <immintrin.h>
51#include <emmintrin.h>
52#include <smmintrin.h>
53#include <tmmintrin.h>
54
55/**
56 *******************************************************************************
57 *
58 * @brief
 *  This function performs inverse transform and reconstruction
 * for a 16x16 input block
61 *
62 * @par Description:
 *  Performs inverse transform, adds the prediction data
 * and clips the output to 8 bit
65 *
66 * @param[in] pi2_src
67 *  Input 16x16 coefficients
68 *
69 * @param[in] pi2_tmp
70 *  Temporary 16x16 buffer for storing inverse
71 *  transform 1st stage output
72 *
73 * @param[in] pu1_pred
74 *  Prediction 16x16 block
75 *
 * @param[out] pu1_dst
 *  Output 16x16 block
87 *
88 * @param[in] src_strd
89 *  Input stride
90 *
91 * @param[in] pred_strd
92 *  Prediction stride
93 *
94 * @param[in] dst_strd
95 *  Output Stride
96 *
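 * @param[in] zero_cols
 *  Bitmask of zero columns in pi2_src (a set bit marks an all-zero
 *  column; e.g. 0xFF00 set means the last 8 columns are zero)
 *
 * @param[in] zero_rows
 *  Bitmask of zero rows in pi2_src (a set bit marks an all-zero
 *  row; e.g. 0xFFF0 set means the last 12 rows are zero)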
99 *
100 * @returns  Void
101 *
102 * @remarks
103 *  None
104 *
105 *******************************************************************************
106 */
107
108void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src,
109                                    WORD16 *pi2_tmp,
110                                    UWORD8 *pu1_pred,
111                                    UWORD8 *pu1_dst,
112                                    WORD32 src_strd,
113                                    WORD32 pred_strd,
114                                    WORD32 dst_strd,
115                                    WORD32 zero_cols,
116                                    WORD32 zero_rows)
117{
118    __m128i m_temp_reg_0;
119    __m128i m_temp_reg_1;
120    __m128i m_temp_reg_10;
121    __m128i m_temp_reg_11;
122    __m128i m_temp_reg_12;
123    __m128i m_temp_reg_13;
124    __m128i m_temp_reg_14;
125    __m128i m_temp_reg_20;
126    __m128i m_temp_reg_21;
127    __m128i m_temp_reg_22;
128    __m128i m_temp_reg_23;
129    __m128i m_temp_reg_24;
130    __m128i m_temp_reg_25;
131    __m128i m_temp_reg_26;
132    __m128i m_temp_reg_27;
133    __m128i m_temp_reg_30;
134    __m128i m_temp_reg_31;
135    __m128i m_temp_reg_32;
136    __m128i m_temp_reg_33;
137    __m128i m_temp_reg_34;
138    __m128i m_temp_reg_35;
139    __m128i m_temp_reg_36;
140    __m128i m_temp_reg_37;
141    __m128i m_temp_reg_40;
142    __m128i m_temp_reg_41;
143    __m128i m_temp_reg_42;
144    __m128i m_temp_reg_43;
145    __m128i m_temp_reg_44;
146    __m128i m_temp_reg_45;
147    __m128i m_temp_reg_46;
148    __m128i m_temp_reg_47;
149
150    __m128i m_temp_reg_70;
151    __m128i m_temp_reg_71;
152    __m128i m_temp_reg_72;
153    __m128i m_temp_reg_73;
154    __m128i m_temp_reg_74;
155    __m128i m_temp_reg_75;
156    __m128i m_temp_reg_76;
157    __m128i m_temp_reg_77;
158    __m128i m_rdng_factor;
159    __m128i m_count;
160    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
161    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
162
163    WORD32 i;
164
165    WORD32  zero_last8_cols_stg1;
166    WORD32  zero_last8_rows_stg1;
167    WORD32  zero_last12_rows_stg1;
168    WORD32  zero_last12_rows_stg2;
169    WORD32  zero_last8_rows_stg2;
170
171    WORD32  loop = 0;
172
173    WORD32 i4_shift = IT_SHIFT_STAGE_1;
174    WORD32 trans_size = TRANS_SIZE_16;
175
176    /* Following 3 instructions replicates the value in the */
177    /* lower 16 bits of m_add_iq in the entire register */
178
179    /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
180
181    zero_last8_cols_stg1  = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
182    zero_last8_rows_stg1  = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
183    zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
184
185    zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
186    zero_last8_rows_stg2 = zero_last8_cols_stg1;
187
188    if(zero_last8_cols_stg1)
189    {
190        loop = 1;
191    }
192    else
193        loop = 2;
194
195    /* i = 0 => lower 8 samples */
196    /* i = 1 => higher 8 samples */
197    for(i = 0; i < loop; i++)
198    {
199        {
200            WORD32 sample_half_index = i << 3;
201            WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
202            WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
203
204            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
205            pi2_tmp_src += (src_strd << 1);
206            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
207            pi2_tmp_src += (src_strd << 1);
208            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
209            pi2_tmp_src += (src_strd << 1);
210            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
211            pi2_tmp_src += (src_strd << 1);
212            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
213            pi2_tmp_src += (src_strd << 1);
214            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
215            pi2_tmp_src += (src_strd << 1);
216            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
217            pi2_tmp_src += (src_strd << 1);
218            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
219            pi2_tmp_src += (src_strd << 1);
220
221
222
223
224            /* If last 12 rows are zero : Rishab */
225            if(zero_last12_rows_stg1)
226            {
227
228                /* eee */
229                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
230                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
231                {
232                    /* Loading coeff and src for use in next block */
233
234                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
235                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
236
237                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
238
239                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
240
241                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
242
243                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
244
245                    m_temp_reg_26 = m_temp_reg_24;
246                    m_temp_reg_27 = m_temp_reg_25;
247                }
248
249                /* eo */
250
251                /* eo0[0-3] */
252                {
253                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
254                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
255
256                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
257
258                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
259
260                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
261                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
262                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
263                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
264
265                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
266                    pi2_scratch += 8;
267                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
268                    pi2_scratch += 8;
269
270                }
271
272
273                /* eo0[4-7] */
274                {
275                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
276
277                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
278
279                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
280                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
281                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
282                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
283
284                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
285                    pi2_scratch += 8;
286                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
287                    pi2_scratch += 8;
288
289                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
290                }
291
292                /* eo1[0-3] */
293                {
294                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
295
296                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
297
298                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
299                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
300                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
301                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
302
303                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
304                    pi2_scratch += 8;
305                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
306                    pi2_scratch += 8;
307                }
308
309                /* eo1[4-7] */
310                {
311                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
312
313                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
314
315                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
316                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
317                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
318                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
319
320                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
321                    pi2_scratch += 8;
322                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
323                    pi2_scratch += 8;
324
325                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
326
327                }
328
329                /* eo2[0-3] */
330                {
331                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
332
333                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
334                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
335                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
336                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
337
338                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
339                    pi2_scratch += 8;
340                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
341                    pi2_scratch += 8;
342
343                }
344
345                /* eo2[4-7] */
346                {
347                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
348
349                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
350                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
351                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
352                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
353
354
355                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
356                    pi2_scratch += 8;
357                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
358                    pi2_scratch += 8;
359
360                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
361                }
362
363                /* eo3[0-3] */
364                {
365                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
366
367                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
368                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
369                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
370                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
371
372                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
373                    pi2_scratch += 8;
374                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
375                    pi2_scratch += 8;
376                }
377
378                /* eo3[4-7] */
379                {
380                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
381
382                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
383                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
384                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
385                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
386
387                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
388                    pi2_scratch += 8;
389                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
390                    pi2_scratch += 8;
391                }
392            }
393            /* If last 8 rows are zero : Rishab */
394            else if(zero_last8_rows_stg1)
395            {
396                /* eeo */
397                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
398                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
399                {
400                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
401                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
402
403                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
404                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
405
406                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
407                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
408
409                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
410                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
411
412                }
413
414                /* eee */
415                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
416                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
417                {
418                    /* Loading coeff and src for use in next block */
419                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to  get signs
420                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
421
422                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
423
424                    //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
425
426                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
427
428                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
429
430                    m_temp_reg_26 = m_temp_reg_24;
431                    m_temp_reg_27 = m_temp_reg_25;
432
433                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
434                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
435                }
436
437                /* eo0[0-3] */
438                {
439                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
440                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
441
442                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
443
444                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
445                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
446                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
447
448                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
449                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
450                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
451                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
452
453                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
454                    pi2_scratch += 8;
455                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
456                    pi2_scratch += 8;
457
458                }
459
460                /* eo0[4-7] */
461                {
462                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
463
464                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
465                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
466                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
467
468                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
469                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
470                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
471                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
472
473                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
474                    pi2_scratch += 8;
475                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
476                    pi2_scratch += 8;
477
478                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
479                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
480
481                }
482
483                /* eo1[0-3] */
484                {
485                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
486
487                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
488                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
489                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
490
491                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
492                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
493                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
494                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
495
496                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
497                    pi2_scratch += 8;
498                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
499                    pi2_scratch += 8;
500
501                }
502
503                /* eo1[4-7] */
504                {
505                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
506
507                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
508                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
509                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
510
511                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
512                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
513                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
514                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
515
516                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
517                    pi2_scratch += 8;
518                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
519                    pi2_scratch += 8;
520
521                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
522                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
523
524                }
525
526                /* eo2[0-3] */
527                {
528                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
529
530                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
531                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
532                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
533                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
534
535                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
536                    pi2_scratch += 8;
537                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
538                    pi2_scratch += 8;
539
540                }
541
542                /* eo2[4-7] */
543                {
544                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
545
546                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
547                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
548                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
549                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
550
551                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
552                    pi2_scratch += 8;
553                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
554                    pi2_scratch += 8;
555
556                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
557                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
558
559                }
560
561                /* eo3[0-3] */
562                {
563                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
564
565                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
566                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
567                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
568                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
569
570                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
571                    pi2_scratch += 8;
572                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
573                    pi2_scratch += 8;
574                }
575
576                /* eo3[4-7] */
577                {
578                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
579
580                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
581                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
582                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
583                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
584
585                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
586                    pi2_scratch += 8;
587                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
588                    pi2_scratch += 8;
589                }
590            } /* If all the rows are non-zero : Rishab */
591            else
592            {
593                /* eeo */
594                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
595                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
596
597                {
598                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
599                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
600
601                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
602                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
603
604                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
605                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
606
607                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
608                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
609                }
610
611                /* eee */
612                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
613                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
614                {
615                    /* Loading coeff and src for use in next block */
616                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
617                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
618
619                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
620                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
621
622                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
623                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
624
625                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
626                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
627
628                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
629                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
630
631                }
632                /* eo0[0-3] */
633                {
634                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
635                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
636                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
637                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
638
639                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
640                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
641
642
643                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
644                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
645                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
646
647                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
648                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
649                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
650                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
651                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
652                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
653
654                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
655                    pi2_scratch += 8;
656                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
657                    pi2_scratch += 8;
658
659
660                }
661
662                /* eo0[4-7] */
663                {
664                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
665                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
666
667                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
668                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
669                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
670
671                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
672                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
673                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
674                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
675                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
676                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
677
678                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
679                    pi2_scratch += 8;
680                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
681                    pi2_scratch += 8;
682
683                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
684                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
685
686                }
687
688                /* eo1[0-3] */
689                {
690                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
691                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
692
693                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
694                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
695                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
696
697                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
698                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
699                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
700                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
701                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
702                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
703
704                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
705                    pi2_scratch += 8;
706                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
707                    pi2_scratch += 8;
708
709                }
710
711                /* eo1[4-7] */
712                {
713                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
714                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
715
716                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
717                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
718                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
719
720                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
721                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
722                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
723                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
724                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
725                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
726
727                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
728                    pi2_scratch += 8;
729                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
730                    pi2_scratch += 8;
731                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
732                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
733                }
734
735                /* eo2[0-3] */
736                {
737                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
738                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
739
740                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
741                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
742                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
743                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
744                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
745                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
746
747                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
748                    pi2_scratch += 8;
749                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
750                    pi2_scratch += 8;
751                }
752
753                /* eo2[4-7] */
754                {
755                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
756                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
757
758                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
759                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
760                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
761                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
762                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
763                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
764
765                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
766                    pi2_scratch += 8;
767                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
768                    pi2_scratch += 8;
769
770                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
771                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
772
773                }
774
775                /* eo3[0-3] */
776                {
777                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
778                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
779
780                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
781                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
782                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
783                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
784                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
785                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
786
787
788                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
789                    pi2_scratch += 8;
790                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
791                    pi2_scratch += 8;
792                }
793
794                /* eo3[4-7] */
795                {
796                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
797                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
798
799                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
800                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
801                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
802                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
803                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
804                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
805
806                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
807                    pi2_scratch += 8;
808                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
809                    pi2_scratch += 8;
810                }
811
812            }
813        }
814
815        {
816            WORD32 sample_half_index = i << 3;
817            WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
818
819            m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
820            pi2_tmp_src += (src_strd << 1);
821            m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
822            pi2_tmp_src += (src_strd << 1);
823            m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
824            pi2_tmp_src += (src_strd << 1);
825            m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
826            pi2_tmp_src += (src_strd << 1);
827            m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
828            pi2_tmp_src += (src_strd << 1);
829            m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
830            pi2_tmp_src += (src_strd << 1);
831            m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
832            pi2_tmp_src += (src_strd << 1);
833            m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
834            pi2_tmp_src += (src_strd << 1);
835        }
836
837        /* o & stage 1 out */
838        {
839            WORD32 j;
840            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
841            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
842            WORD32 out_stride = (trans_size << 1);
843            WORD32 in_stride = trans_size << 1;
844
845            if(zero_last12_rows_stg1)
846            {
847                for(j = 0; j < 2; j++)
848                {
849                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
850                    {
851                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
852                    }
853                    else
854                    {
855                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
856                    }
857                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
858
859
860                    /* o0[0-3] */
861                    {
862                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
863
864
865                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
866                        pi2_src_scratch += in_stride;
867
868                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
869
870
871                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
872                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
873
874                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
875                        m_count = _mm_cvtsi32_si128(i4_shift);
876                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
877
878                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
879                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
880                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
881                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
882
883                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
884
885                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
886                        pi2_dst_scratch += out_stride;
887                    }
888
889                    /* o1[0-3] */
890                    {
891                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
892
893
894                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
895                        pi2_src_scratch += in_stride;
896
897                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
898
899                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
900                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
901
902                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
903                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
904                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
905                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
906
907                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
908
909                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
910                        pi2_dst_scratch += out_stride;
911                    }
912
913                    /* o2[0-3] */
914                    {
915                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
916
917                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
918                        pi2_src_scratch += in_stride;
919
920                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
921
922                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
923                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
924
925                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
926                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
927                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
928                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
929
930                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
931
932                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
933                        pi2_dst_scratch += out_stride;
934                    }
935
936                    /* o3[0-3] */
937                    {
938                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
939
940                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
941                        pi2_src_scratch += 8;
942
943                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
944
945                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
946                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
947
948                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
949                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
950                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
951                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
952
953                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
954
955                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
956                        pi2_dst_scratch += 8;
957                    }
958
959                    /* o4[0-3] */
960                    {
961                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
962
963                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
964                        pi2_src_scratch -= in_stride;
965
966                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
967
968                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
969                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
970
971                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
972                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
973                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
974                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
975
976                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
977
978                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
979                        pi2_dst_scratch -= out_stride;
980                    }
981
982                    /* o5[0-3] */
983                    {
984                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
985
986
987                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
988                        pi2_src_scratch -= in_stride;
989
990                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
991
992                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
993                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
994
995                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
996                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
997                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
998                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
999
1000                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1001
1002                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1003                        pi2_dst_scratch -= out_stride;
1004                    }
1005
1006                    /* o6[0-3] */
1007                    {
1008                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1009
1010                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1011                        pi2_src_scratch -= in_stride;
1012
1013                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1014
1015                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1016                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1017
1018                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1019                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1020                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1021                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1022
1023                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1024
1025                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1026                        pi2_dst_scratch -= out_stride;
1027                    }
1028
1029                    /* o7[0-3] */
1030                    {
1031                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1032
1033                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1034                        pi2_src_scratch += 8;
1035
1036                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1037                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1038
1039                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1040                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1041                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1042                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1043
1044                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1045
1046                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1047                        pi2_dst_scratch += 8;
1048                    }
1049                }
1050            }
1051            else if(zero_last8_rows_stg1)
1052            {
1053                for(j = 0; j < 2; j++)
1054                {
1055                    if(j)
1056                    {
1057                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1058                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1059                    }
1060                    else
1061                    {
1062                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1063                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1064                    }
1065                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1066                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1067
1068                    /* o0[0-3] */
1069                    {
1070                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1071                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1072
1073                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1074                        pi2_src_scratch += in_stride;
1075
1076                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1077                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1078
1079                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1080                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1081                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1082
1083
1084                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1085                        m_count = _mm_cvtsi32_si128(i4_shift);
1086
1087                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
1088
1089                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1090                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1091                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1092                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1093
1094                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1095
1096                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1097                        pi2_dst_scratch += out_stride;
1098                    }
1099
1100                    /* o1[0-3] */
1101                    {
1102                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1103                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1104
1105                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1106                        pi2_src_scratch += in_stride;
1107
1108                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1109                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1110
1111                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1112                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1113                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1114
1115                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1116                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1117                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1118                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1119
1120                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1121
1122                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1123                        pi2_dst_scratch += out_stride;
1124                    }
1125
1126                    /* o2[0-3] */
1127                    {
1128                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1129                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1130
1131                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1132                        pi2_src_scratch += in_stride;
1133
1134                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1135                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1136
1137                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1138                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1139                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1140
1141                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1142                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1143                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1144                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1145
1146                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1147
1148                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1149                        pi2_dst_scratch += out_stride;
1150                    }
1151
1152                    /* o3[0-3] */
1153                    {
1154                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1155                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1156
1157                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1158                        pi2_src_scratch += 8;
1159
1160                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1161                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1162
1163                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1164                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1165                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1166
1167                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1168                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1169                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1170                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1171
1172                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1173
1174                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1175                        pi2_dst_scratch += 8;
1176                    }
1177
1178                    /* o4[0-3] */
1179                    {
1180                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1181                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1182
1183                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1184                        pi2_src_scratch -= in_stride;
1185
1186                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1187                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1188
1189                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1190                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1191                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1192
1193
1194                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1195                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1196                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1197                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1198
1199                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1200
1201                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1202                        pi2_dst_scratch -= out_stride;
1203                    }
1204
1205                    /* o5[0-3] */
1206                    {
1207                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1208                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1209
1210                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1211                        pi2_src_scratch -= in_stride;
1212
1213                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1214                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1215
1216                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1217                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1218                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1219
1220
1221                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1222                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1223                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1224                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1225
1226                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1227
1228                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1229                        pi2_dst_scratch -= out_stride;
1230                    }
1231
1232                    /* o6[0-3] */
1233                    {
1234                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1235                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1236
1237                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1238                        pi2_src_scratch -= in_stride;
1239
1240                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1241                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1242
1243                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1244                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1245                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1246
1247                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1248                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1249                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1250                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1251
1252                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1253
1254                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1255                        pi2_dst_scratch -= out_stride;
1256                    }
1257
1258                    /* o7[0-3] */
1259                    {
1260                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1261                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1262
1263                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1264                        pi2_src_scratch += 8;
1265
1266                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1267                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1268                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1269
1270                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1271                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1272                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1273                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1274
1275                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1276
1277                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1278                        pi2_dst_scratch += 8;
1279                    }
1280                }
1281
1282            }
1283            else
1284            {
1285
1286
1287
1288                for(j = 0; j < 2; j++)
1289                {
1290                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
1291                    {
1292                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1293                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1294                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
1295                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
1296                    }
1297                    else
1298                    {
1299                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1300                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1301                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
1302                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
1303                    }
1304                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1305                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1306                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
1307                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
1308
1309
1310                    /* o0[0-3] */
1311                    {
1312                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1313                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1314                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1315                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1316
1317
1318                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1319                        pi2_src_scratch += in_stride;
1320
1321                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1322                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1323                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
1324                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
1325
1326                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1327                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
1328                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1329                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1330                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1331
1332                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1333                        m_count = _mm_cvtsi32_si128(i4_shift);
1334                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1335                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1336
1337                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1338                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1339                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1340                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1341
1342                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1343
1344                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1345                        pi2_dst_scratch += out_stride;
1346                    }
1347
1348                    /* o1[0-3] */
1349                    {
1350                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1351                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1352                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1353                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1354
1355
1356                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1357                        pi2_src_scratch += in_stride;
1358
1359                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1360                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1361                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
1362                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
1363
1364                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1365                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1366                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1367                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1368                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1369
1370                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1371                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1372                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1373                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1374
1375                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1376
1377                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1378                        pi2_dst_scratch += out_stride;
1379                    }
1380
1381                    /* o2[0-3] */
1382                    {
1383                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1384                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1385                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1386                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1387
1388                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1389                        pi2_src_scratch += in_stride;
1390
1391                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1392                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1393                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
1394                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
1395
1396                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1397                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1398                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1399                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1400                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1401
1402                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1403                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1404                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1405                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1406
1407                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1408
1409                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1410                        pi2_dst_scratch += out_stride;
1411                    }
1412
1413                    /* o3[0-3] */
1414                    {
1415                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1416                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1417                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1418                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1419
1420
1421                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1422                        pi2_src_scratch += 8;
1423
1424                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1425                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1426                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
1427                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
1428
1429                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1430                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1431                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1432                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1433                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1434
1435                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1436                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1437                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1438                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1439
1440                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1441
1442                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1443                        pi2_dst_scratch += 8;
1444                    }
1445
1446                    /* o4[0-3] */
1447                    {
1448                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1449                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1450                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1451                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1452
1453                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1454                        pi2_src_scratch -= in_stride;
1455
1456                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1457                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1458                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
1459                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
1460
1461                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1462                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1463                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1464                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1465                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1466
1467                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1468                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1469                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1470                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1471
1472                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1473
1474                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1475                        pi2_dst_scratch -= out_stride;
1476                    }
1477
1478                    /* o5[0-3] */
1479                    {
1480                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1481                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1482                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1483                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1484
1485
1486                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1487                        pi2_src_scratch -= in_stride;
1488
1489                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1490                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1491                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
1492                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
1493
1494                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1495                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1496                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1497                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1498                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1499
1500                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1501                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1502                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1503                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1504
1505                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1506
1507                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1508                        pi2_dst_scratch -= out_stride;
1509                    }
1510
1511                    /* o6[0-3] */
1512                    {
1513                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1514                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1515                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1516                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1517
1518
1519                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1520                        pi2_src_scratch -= in_stride;
1521
1522                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1523                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1524                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
1525                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
1526
1527
1528                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1529                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1530                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1531                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1532                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1533
1534                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1535                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1536                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1537                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1538
1539                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1540
1541                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1542                        pi2_dst_scratch -= out_stride;
1543                    }
1544
1545                    /* o7[0-3] */
1546                    {
1547                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1548                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1549                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1550                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1551
1552                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1553                        pi2_src_scratch += 8;
1554
1555                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1556                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1557                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1558                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1559                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1560
1561
1562                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1563                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1564                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1565                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1566
1567                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1568
1569                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1570                        pi2_dst_scratch += 8;
1571                    }
1572                }
1573            }
1574        }
1575
1576        /* Transpose */
1577        {
1578            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
1579            WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
1580            WORD32 out_stride = (trans_size << 1);
1581            WORD32 in_stride = (trans_size << 1);
1582            WORD32 j;
1583
1584            for(j = 0; j < 2; j++)
1585            {
1586                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
1587                pi2_src_scratch += in_stride;
1588                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
1589                pi2_src_scratch += in_stride;
1590                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
1591                pi2_src_scratch += in_stride;
1592                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
1593                pi2_src_scratch += 8;
1594                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
1595                pi2_src_scratch -= in_stride;
1596                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
1597                pi2_src_scratch -= in_stride;
1598                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
1599                pi2_src_scratch -= in_stride;
1600                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
1601                pi2_src_scratch += 8;
1602
1603                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
1604                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
1605
1606                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
1607                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
1608
1609                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
1610                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
1611
1612                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
1613                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
1614
1615
1616                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
1617                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
1618
1619                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
1620                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
1621
1622                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
1623                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
1624
1625                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
1626                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
1627
1628
1629                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
1630                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
1631
1632                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
1633                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
1634
1635                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
1636                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
1637
1638                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
1639                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
1640
1641                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1642                pi2_dst_scratch += out_stride;
1643                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
1644                pi2_dst_scratch += out_stride;
1645                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
1646                pi2_dst_scratch += out_stride;
1647                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
1648                pi2_dst_scratch += 8;
1649                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
1650                pi2_dst_scratch -= out_stride;
1651                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
1652                pi2_dst_scratch -= out_stride;
1653                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
1654                pi2_dst_scratch -= out_stride;
1655                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
1656                pi2_dst_scratch += 8;
1657            }
1658        }
1659    }
1660
1661    if(zero_last8_cols_stg1)
1662    {
1663        WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
1664        WORD32 out_stride = (trans_size << 1);
1665        WORD32 j;
1666
1667        m_temp_reg_40 = _mm_setzero_si128();
1668        for(j = 0; j < 2; j++)
1669        {
1670            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1671            pi2_dst_scratch += out_stride;
1672            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1673            pi2_dst_scratch += out_stride;
1674            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1675            pi2_dst_scratch += out_stride;
1676            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1677            pi2_dst_scratch += 8;
1678            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1679            pi2_dst_scratch -= out_stride;
1680            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1681            pi2_dst_scratch -= out_stride;
1682            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1683            pi2_dst_scratch -= out_stride;
1684            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1685            pi2_dst_scratch += 8;
1686        }
1687    }
1688
1689
1690
1691
1692    /* Stage 2 */
1693    for(i = 0; i < 2; i++)
1694    {
1695        //__m128i m_temp_reg_15,m_temp_reg_16;
1696        WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
1697        WORD32 stride = (trans_size);
1698        WORD16 temp_array[256];
1699
1700        i4_shift = IT_SHIFT_STAGE_2;
1701
1702        if(zero_last12_rows_stg2)
1703        {
1704            /* eeo */
1705            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1706            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1707            {
1708                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1709
1710                pi2_src_temp += (stride * 9);
1711
1712                if(!i)
1713                {
1714                    pi2_src_temp += (stride * 6 + 8);
1715                }
1716                else
1717                {
1718                    pi2_src_temp += (stride * 2 + 8);
1719                }
1720
1721                pi2_src_temp -= (stride * 9);
1722
1723                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1724
1725                m_temp_reg_20 = _mm_setzero_si128();
1726                m_temp_reg_22 = _mm_setzero_si128();
1727
1728                m_temp_reg_21 = _mm_setzero_si128();
1729                m_temp_reg_23 = _mm_setzero_si128();
1730            }
1731
1732            /* eee */
1733            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1734            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1735            {
1736                /* Loading coeff and src for use in next block */
1737
1738                /* Sign-extend row 0 to 32 bits (high halves come from the compare-against-zero mask) and scale by 64 (<< 6) */
1739                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
1740
1741                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1742
1743                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1744
1745                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1746                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1747
1748                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1749                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
1750
1751                m_temp_reg_26 = m_temp_reg_24;
1752                m_temp_reg_27 = m_temp_reg_25;
1753                /*  */
1754
1755                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
1756                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
1757            }
1758
1759            /* eo */
1760            {
1761                WORD16 *pi2_scratch = temp_array;
1762                WORD32 out_stride = 8;
1763
1764
1765                /* eo0[0-3] */
1766                {
1767                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1768
1769                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1770
1771                    /* e[0][0-3] stored in pu1_dst[0] */
1772                    /* e[7][0-3] stored in pu1_dst[1] */
1773                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1774                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1775
1776                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1777                    pi2_scratch += out_stride;
1778                    _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1779                    pi2_scratch += out_stride;
1780                }
1781
1782                /* eo0[4-7] */
1783                {
1784                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1785
1786                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1787
1788                    /* e[0][4-7] stored in pu1_dst[2] */
1789                    /* e[7][4-7] stored in pu1_dst[3] */
1790
1791                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1792                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1793
1794                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1795                    pi2_scratch += out_stride;
1796                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1797                    pi2_scratch += out_stride;
1798
1799                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
1800                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
1801
1802                }
1803
1804                /* eo1[0-3] */
1805                {
1806                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1807
1808                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
1809
1810                    /* e[1][0-3] stored in pu1_dst[4] */
1811                    /* e[6][0-3] stored in pu1_dst[5] */
1812                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1813                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1814
1815                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1816                    pi2_scratch += out_stride;
1817                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1818                    pi2_scratch += out_stride;
1819                }
1820
1821                /* eo1[4-7] */
1822                {
1823                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1824
1825                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
1826
1827                    /* e[1][4-7] stored in pu1_dst[6]*/
1828                    /* e[6][4-7] stored in pu1_dst[7] */
1829                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1830                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1831
1832                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1833                    pi2_scratch += out_stride;
1834                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1835                    pi2_scratch += out_stride;
1836
1837                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
1838
1839                }
1840
1841                /* eo2[0-3] */
1842                {
1843                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1844
1845                    /* e[2][0-3] stored in pu1_dst[8]*/
1846                    /* e[5][0-3] stored in pu1_dst[9] */
1847                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1848                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1849
1850                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1851                    pi2_scratch += out_stride;
1852                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1853                    pi2_scratch += out_stride;
1854                }
1855
1856                /* eo2[4-7] */
1857                {
1858                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1859
1860                    /* e[2][4-7] stored in pu1_dst[10]*/
1861                    /* e[5][4-7] stored in pu1_dst[11] */
1862                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1863                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1864
1865                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1866                    pi2_scratch += out_stride;
1867                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1868                    pi2_scratch += out_stride;
1869
1870                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
1871                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
1872
1873                }
1874
1875                /* eo3[0-3] */
1876                {
1877                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1878
1879                    /* e[3][0-3] stored in pu1_dst[12]*/
1880                    /* e[4][0-3] stored in pu1_dst[13] */
1881                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1882                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1883
1884                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1885                    pi2_scratch += out_stride;
1886                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1887                    pi2_scratch += out_stride;
1888                }
1889
1890                /* eo3[4-7] */
1891                {
1892                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1893
1894                    /* e[3][4-7] stored in pu1_dst[14]*/
1895                    /* e[4][4-7] stored in pu1_dst[15] */
1896                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1897                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1898
1899                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1900                    pi2_scratch += out_stride;
1901                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1902                    pi2_scratch += out_stride;
1903                }
1904
1905            }
1906        }
1907        else if(zero_last8_rows_stg2)
1908        {
1909            /* eeo */
1910            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1911            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1912            {
1913
1914                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
1915                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
1916
1917                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1918                pi2_src_temp += (stride);
1919                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
1920                pi2_src_temp += (stride * 8);
1921
1922                if(!i)
1923                {
1924                    pi2_src_temp += (stride * 6 + 8);
1925                }
1926                else
1927                {
1928                    pi2_src_temp += (stride * 2 + 8);
1929                }
1930
1931                pi2_src_temp -= (stride * 8);
1932                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
1933                pi2_src_temp -= (stride);
1934                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1935
1936
1937                m_temp_reg_76 = _mm_setzero_si128();
1938
1939
1940                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
1941                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
1942
1943                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
1944                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
1945
1946                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1947                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1948
1949                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1950                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1951            }
1952
1953            /* eee */
1954            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1955            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1956            {
1957                /* Loading coeff and src for use in next block */
1958
1959
1960                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
1961
1962                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1963
1964                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1965                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1966                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1967
1968                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1969
1970                m_temp_reg_26 = m_temp_reg_24;
1971                m_temp_reg_27 = m_temp_reg_25;
1972
1973                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1974                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1975            }
1976
1977            /* eo */
1978            {
1979                WORD16 *pi2_scratch = temp_array;
1980                WORD32 out_stride = 8;
1981
1982
1983                /* eo0[0-3] */
1984                {
1985                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1986
1987                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1988                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
1989                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
1990
1991                    /* e[0][0-3] stored in pu1_dst[0] */
1992                    /* e[7][0-3] stored in pu1_dst[1] */
1993                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1994                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1995
1996                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1997                    pi2_scratch += out_stride;
1998                    _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1999                    pi2_scratch += out_stride;
2000                }
2001
2002                /* eo0[4-7] */
2003                {
2004                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2005
2006                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2007                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2008                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2009
2010                    /* e[0][4-7] stored in pu1_dst[2] */
2011                    /* e[7][4-7] stored in pu1_dst[3] */
2012
2013                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2014                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2015
2016                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2017                    pi2_scratch += out_stride;
2018                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2019                    pi2_scratch += out_stride;
2020
2021                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2022                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2023
2024                }
2025
2026                /* eo1[0-3] */
2027                {
2028                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2029
2030                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2031                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2032                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2033
2034                    /* e[1][0-3] stored in pu1_dst[4] */
2035                    /* e[6][0-3] stored in pu1_dst[5] */
2036                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2037                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2038
2039                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2040                    pi2_scratch += out_stride;
2041                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2042                    pi2_scratch += out_stride;
2043                }
2044
2045                /* eo1[4-7] */
2046                {
2047                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2048
2049                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2050                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2051                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2052
2053                    /* e[1][4-7] stored in pu1_dst[6]*/
2054                    /* e[6][4-7] stored in pu1_dst[7] */
2055                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2056                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2057
2058                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2059                    pi2_scratch += out_stride;
2060                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2061                    pi2_scratch += out_stride;
2062
2063                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2064
2065                }
2066
2067                /* eo2[0-3] */
2068                {
2069                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2070
2071                    /* e[2][0-3] stored in pu1_dst[8]*/
2072                    /* e[5][0-3] stored in pu1_dst[9] */
2073                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2074                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2075
2076                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2077                    pi2_scratch += out_stride;
2078                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2079                    pi2_scratch += out_stride;
2080                }
2081
2082                /* eo2[4-7] */
2083                {
2084                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2085
2086                    /* e[2][4-7] stored in pu1_dst[10]*/
2087                    /* e[5][4-7] stored in pu1_dst[11] */
2088                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2089                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2090
2091                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2092                    pi2_scratch += out_stride;
2093                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2094                    pi2_scratch += out_stride;
2095
2096                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2097                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2098
2099                }
2100
2101                /* eo3[0-3] */
2102                {
2103                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2104
2105                    /* e[3][0-3] stored in pu1_dst[12]*/
2106                    /* e[4][0-3] stored in pu1_dst[13] */
2107                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2108                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2109
2110                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2111                    pi2_scratch += out_stride;
2112                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2113                    pi2_scratch += out_stride;
2114                }
2115
2116                /* eo3[4-7] */
2117                {
2118                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2119
2120                    /* e[3][4-7] stored in pu1_dst[14]*/
2121                    /* e[4][4-7] stored in pu1_dst[15] */
2122                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2123                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2124
2125                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2126                    pi2_scratch += out_stride;
2127                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2128                    pi2_scratch += out_stride;
2129                }
2130            }
2131        }
2132
2133        else
2134        {
2135            /* eeo */
2136            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
2137            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
2138            {
2139
2140
2141                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
2142                pi2_src_temp += (stride);
2143                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
2144                pi2_src_temp += (stride * 7);
2145                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
2146                pi2_src_temp += (stride);
2147                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
2148                if(!i)
2149                {
2150                    pi2_src_temp += (stride * 6 + 8);
2151                }
2152                else
2153                {
2154                    pi2_src_temp += (stride * 2 + 8);
2155                }
2156                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
2157                pi2_src_temp -= (stride);
2158                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
2159                pi2_src_temp -= (stride * 7);
2160                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
2161                pi2_src_temp -= (stride);
2162                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
2163
2164                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
2165                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
2166
2167                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
2168                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
2169
2170                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2171                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2172
2173                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2174                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2175
2176
2177            }
2178
2179            /* eee */
2180            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
2181            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
2182            {
2183                /* Loading coeff and src for use in next block */
2184                m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
2185                m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
2186
2187                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
2188                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
2189
2190                m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2191                m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
2192
2193                m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2194                m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
2195
2196                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
2197                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
2198
2199            }
2200
2201            /* eo */
2202            {
2203                WORD16 *pi2_scratch = temp_array;
2204                WORD32 out_stride = 8;
2205
2206
2207
2208                /* eo0[0-3] */
2209                {
2210                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2211                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2212                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
2213                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
2214
2215
2216                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2217                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2218
2219
2220                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2221                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
2222                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
2223
2224
2225                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
2226                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
2227                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2228                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2229                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2230                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2231
2232                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2233                    pi2_scratch += out_stride;
2234                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2235                    pi2_scratch += out_stride;
2236
2237
2238                }
2239
2240                /* eo0[4-7] */
2241                {
2242
2243                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2244                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2245
2246                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2247                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2248                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2249
2250                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
2251                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
2252                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2253                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2254                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2255                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2256
2257                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2258                    pi2_scratch += out_stride;
2259                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2260                    pi2_scratch += out_stride;
2261
2262                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2263                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2264
2265                }
2266
2267                /* eo1[0-3] */
2268                {
2269                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2270                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2271
2272                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2273                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2274                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2275
2276                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
2277                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
2278                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2279                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2280                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
2281                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
2282
2283                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2284                    pi2_scratch += out_stride;
2285                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2286                    pi2_scratch += out_stride;
2287
2288                }
2289
2290                /* eo1[4-7] */
2291                {
2292                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2293                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2294
2295                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2296                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2297                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2298
2299                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
2300                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
2301                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2302                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2303                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
2304                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
2305
2306                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2307                    pi2_scratch += out_stride;
2308                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2309                    pi2_scratch += out_stride;
2310                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2311                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
2312                }
2313
2314                /* eo2[0-3] */
2315                {
2316                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2317                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2318
2319                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
2320                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
2321                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2322                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2323                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2324                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2325
2326                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2327                    pi2_scratch += out_stride;
2328                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2329                    pi2_scratch += out_stride;
2330                }
2331
2332                /* eo2[4-7] */
2333                {
2334                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2335                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2336
2337                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
2338                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
2339                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2340                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2341                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2342                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2343
2344                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2345                    pi2_scratch += out_stride;
2346                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2347                    pi2_scratch += out_stride;
2348
2349                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2350                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2351
2352                }
2353
2354                /* eo3[0-3] */
2355                {
2356                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2357                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2358
2359                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
2360                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
2361                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2362                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2363                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2364                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2365
2366
2367                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2368                    pi2_scratch += out_stride;
2369                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2370                    pi2_scratch += out_stride;
2371                }
2372
2373                /* eo3[4-7] */
2374                {
2375                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2376                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2377
2378                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
2379                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
2380                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2381                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2382                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2383                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2384
2385                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2386                    pi2_scratch += out_stride;
2387                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2388                    pi2_scratch += out_stride;
2389                }
2390            }
2391        }
2392
2393        if(zero_last12_rows_stg2)
2394        {
2395            /* o & stage 2 pre-transposed out */
2396            {
2397                WORD32 j;
2398                WORD16 *pi2_src_scratch = temp_array;
2399                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2400                WORD32 out_stride = (trans_size);
2401                WORD32 in_stride = (8) * 4;
2402
2403                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2404
2405                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2406
2407                pi2_src_temp += (stride * 9);
2408
2409                if(0 == i)
2410                {
2411                    pi2_src_temp -= (stride * 2 - 8);
2412                }
2413                else
2414                {
2415                    pi2_src_temp -= (stride * 6 - 8);
2416                }
2417                pi2_src_temp -= (stride * 9);
2418
2419                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2420
2421
2422                for(j = 0; j < 2; j++)
2423                {
2424                    if(j)
2425                    {
2426                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2427                    }
2428                    else
2429                    {
2430                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2431                    }
2432                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2433
2434                    /* o0[0-3] */
2435                    {
2436                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2437
2438                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2439                        pi2_src_scratch += in_stride;
2440
2441                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2442
2443                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2444                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2445
2446
2447                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2448                        m_count = _mm_cvtsi32_si128(i4_shift);
2449                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2450
2451
2452                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2453                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2454                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2455                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2456
2457                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2458
2459                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2460                        pi2_dst_scratch += out_stride;
2461                    }
2462
2463                    /* o1[0-3] */
2464                    {
2465                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2466
2467                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2468                        pi2_src_scratch += in_stride;
2469
2470                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2471
2472                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2473                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2474
2475                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2476                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2477                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2478                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2479
2480                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2481
2482                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2483                        pi2_dst_scratch += ((!i) * out_stride + 8);
2484                    }
2485
2486                    /* o2[0-3] */
2487                    {
2488                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2489
2490                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2491                        pi2_src_scratch += in_stride;
2492
2493                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2494
2495                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2496                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2497
2498                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2499                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2500                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2501                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2502
2503                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2504
2505                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2506                        pi2_dst_scratch += out_stride;
2507                    }
2508
2509                    /* o3[0-3] */
2510                    {
2511                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2512
2513                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2514                        pi2_src_scratch += 8;
2515
2516                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2517
2518                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2519                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2520
2521                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2522                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2523                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2524                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2525
2526                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2527
2528                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2529                        pi2_dst_scratch += (i * out_stride + 8);
2530                    }
2531
2532                    /* o4[0-3] */
2533                    {
2534                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2535
2536                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2537                        pi2_src_scratch -= in_stride;
2538
2539                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2540
2541                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2542                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2543
2544
2545                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2546                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2547                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2548                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2549
2550                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2551
2552                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2553                        pi2_dst_scratch += out_stride;
2554                    }
2555
2556                    /* o5[0-3] */
2557                    {
2558                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2559
2560                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2561                        pi2_src_scratch -= in_stride;
2562
2563                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2564
2565                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2566                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2567
2568
2569                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2570                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2571                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2572                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2573
2574                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2575
2576                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2577                        pi2_dst_scratch += ((!i) * out_stride + 8);
2578                    }
2579
2580                    /* o6[0-3] */
2581                    {
2582                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2583
2584                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2585                        pi2_src_scratch -= in_stride;
2586
2587                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2588
2589                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2590                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2591
2592                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2593                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2594                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2595                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2596
2597                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2598
2599                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2600                        pi2_dst_scratch += out_stride;
2601                    }
2602
2603                    /* o7[0-3] */
2604                    {
2605                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2606
2607                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2608                        pi2_src_scratch += 8;
2609
2610                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2611                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2612
2613                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2614                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2615                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2616                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2617
2618                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2619
2620                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2621                        pi2_dst_scratch += (i * out_stride + 8);
2622                    }
2623
2624
2625                }
2626            }
2627        }
2628        else if(zero_last8_rows_stg2)
2629        {
2630            /* o & stage 2 pre-transposed out */
2631            {
2632                WORD32 j;
2633                WORD16 *pi2_src_scratch = temp_array;
2634                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2635                WORD32 out_stride = (trans_size);
2636                WORD32 in_stride = (8) * 4;
2637
2638                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2639
2640
2641                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2642                pi2_src_temp += (stride);
2643                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2644                pi2_src_temp += (stride * 8);
2645
2646                if(0 == i)
2647                {
2648                    pi2_src_temp -= (stride * 2 - 8);
2649                }
2650                else
2651                {
2652                    pi2_src_temp -= (stride * 6 - 8);
2653                }
2654
2655                pi2_src_temp -= (stride * 8);
2656                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2657                pi2_src_temp -= (stride);
2658                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2659
2660
2661                for(j = 0; j < 2; j++)
2662                {
2663                    if(j)
2664                    {
2665                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2666                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2667                    }
2668                    else
2669                    {
2670                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2671                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2672                    }
2673                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2674                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2675
2676                    /* o0[0-3] */
2677                    {
2678                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2679                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2680
2681                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2682                        pi2_src_scratch += in_stride;
2683
2684                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2685                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2686
2687                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2688                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2689                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2690
2691
2692                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2693                        m_count = _mm_cvtsi32_si128(i4_shift);
2694
2695                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2696
2697                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2698                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2699                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2700                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2701
2702                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2703
2704                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2705                        pi2_dst_scratch += out_stride;
2706                    }
2707
2708                    /* o1[0-3] */
2709                    {
2710                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2711                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2712
2713                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2714                        pi2_src_scratch += in_stride;
2715
2716                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2717                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
2718
2719                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2720                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2721                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2722
2723                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2724                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2725                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2726                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2727
2728                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2729
2730                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2731                        pi2_dst_scratch += ((!i) * out_stride + 8);
2732                    }
2733
2734                    /* o2[0-3] */
2735                    {
2736                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2737                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2738
2739                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2740                        pi2_src_scratch += in_stride;
2741
2742                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2743                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
2744
2745                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2746                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2747                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2748
2749                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2750                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2751                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2752                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2753
2754                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2755
2756                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2757                        pi2_dst_scratch += out_stride;
2758                    }
2759
2760                    /* o3[0-3] */
2761                    {
2762                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2763                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2764
2765                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2766                        pi2_src_scratch += 8;
2767
2768                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2769                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
2770
2771                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
2772                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2773                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2774
2775                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2776                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2777                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2778                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2779
2780                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2781
2782                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2783                        pi2_dst_scratch += (i * out_stride + 8);
2784                    }
2785
2786                    /* o4[0-3] */
2787                    {
2788                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2789                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2790
2791                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2792                        pi2_src_scratch -= in_stride;
2793
2794                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2795                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
2796
2797                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2798                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2799                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2800
2801
2802                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2803                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2804                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2805                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2806
2807                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2808
2809                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2810                        pi2_dst_scratch += out_stride;
2811                    }
2812
2813                    /* o5[0-3] */
2814                    {
2815                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2816                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2817
2818                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2819                        pi2_src_scratch -= in_stride;
2820
2821                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2822                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
2823
2824                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2825                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2826                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2827
2828
2829                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2830                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2831                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2832                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2833
2834                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2835
2836                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2837                        pi2_dst_scratch += ((!i) * out_stride + 8);
2838                    }
2839
2840                    /* o6[0-3] */
2841                    {
2842                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2843                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2844
2845                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2846                        pi2_src_scratch -= in_stride;
2847
2848                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2849                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
2850
2851                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2852                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2853                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2854
2855                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2856                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2857                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2858                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2859
2860                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2861
2862                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2863                        pi2_dst_scratch += out_stride;
2864                    }
2865
2866                    /* o7[0-3] */
2867                    {
2868                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2869                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2870
2871                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2872                        pi2_src_scratch += 8;
2873
2874                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2875                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2876                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2877
2878                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2879                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2880                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2881                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2882
2883                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2884
2885                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2886                        pi2_dst_scratch += (i * out_stride + 8);
2887                    }
2888                }
2889            }
2890        }
2891        else
2892        {
2893            /* o & stage 2 pre-transposed out */
2894            {
2895                WORD32 j;
2896                WORD16 *pi2_src_scratch = temp_array;
2897                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2898                WORD32 out_stride = (trans_size);
2899                WORD32 in_stride = (8) * 4;
2900
2901                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2902
2903
2904                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2905                pi2_src_temp += (stride);
2906                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2907                pi2_src_temp += (stride * 7);
2908                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
2909                pi2_src_temp += (stride);
2910                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
2911                if(0 == i)
2912                {
2913                    pi2_src_temp -= (stride * 2 - 8);
2914                }
2915                else
2916                {
2917                    pi2_src_temp -= (stride * 6 - 8);
2918                }
2919                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
2920                pi2_src_temp -= (stride);
2921                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
2922                pi2_src_temp -= (stride * 7);
2923                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2924                pi2_src_temp -= (stride);
2925                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2926
2927
2928                for(j = 0; j < 2; j++)
2929                {
2930
2931                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
2932                    {
2933                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2934                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2935                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
2936                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
2937                    }
2938                    else
2939                    {
2940                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2941                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2942                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
2943                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
2944                    }
2945                    m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2946                    m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2947                    m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
2948                    m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
2949
2950
2951                    /* o0[0-3] */
2952                    {
2953                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2954                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2955                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2956                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2957
2958
2959                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2960                        pi2_src_scratch += in_stride;
2961
2962                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2963                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2964                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
2965                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
2966
2967                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2968                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2969                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2970                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2971                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2972
2973                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2974                        m_count = _mm_cvtsi32_si128(i4_shift);
2975                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2976
2977                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2978                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2979                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2980                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2981
2982                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2983
2984                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2985                        pi2_dst_scratch += out_stride;
2986                    }
2987
2988                    /* o1[0-3] */
2989                    {
2990                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2991                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2992                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
2993                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
2994
2995
2996                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2997                        pi2_src_scratch += in_stride;
2998
2999                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
3000                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
3001                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
3002                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
3003
3004                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3005                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3006                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3007                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3008                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3009
3010                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3011                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3012                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3013                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3014
3015                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3016
3017                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3018                        pi2_dst_scratch += ((!i) * out_stride + 8);
3019                    }
3020
3021                    /* o2[0-3] */
3022                    {
3023                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3024                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3025                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3026                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3027
3028                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3029                        pi2_src_scratch += in_stride;
3030
3031                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
3032                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
3033                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
3034                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
3035
3036                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3037                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3038                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3039                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3040                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3041
3042                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3043                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3044                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3045                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3046
3047                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3048
3049                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3050                        pi2_dst_scratch += out_stride;
3051                    }
3052
3053                    /* o3[0-3] */
3054                    {
3055                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3056                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3057                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3058                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3059
3060
3061                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3062                        pi2_src_scratch += 8;
3063
3064                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
3065                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
3066                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
3067                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
3068
3069                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
3070                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3071                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3072                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3073                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3074
3075                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3076                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3077                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3078                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3079
3080                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3081
3082                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3083                        pi2_dst_scratch += (i * out_stride + 8);
3084                    }
3085
3086                    /* o4[0-3] */
3087                    {
3088                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3089                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3090                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3091                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3092
3093                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3094                        pi2_src_scratch -= in_stride;
3095
3096                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
3097                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
3098                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
3099                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
3100
3101                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3102                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3103                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3104                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3105                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3106
3107                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3108                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3109                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3110                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3111
3112                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3113
3114                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3115                        pi2_dst_scratch += out_stride;
3116                    }
3117
3118                    /* o5[0-3] */
3119                    {
3120                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3121                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3122                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3123                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3124
3125
3126                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3127                        pi2_src_scratch -= in_stride;
3128
3129                        m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
3130                        m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
3131                        m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
3132                        m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
3133
3134                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3135                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3136                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3137                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3138                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3139
3140                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3141                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3142                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3143                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3144
3145                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3146
3147                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3148                        pi2_dst_scratch += ((!i) * out_stride + 8);
3149                    }
3150
3151                    /* o6[0-3] */
3152                    {
3153                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3154                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3155                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3156                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3157
3158
3159                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3160                        pi2_src_scratch -= in_stride;
3161
3162                        m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
3163                        m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
3164                        m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
3165                        m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
3166
3167
3168                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3169                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3170                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3171                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3172                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3173
3174                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3175                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3176                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3177                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3178
3179                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3180
3181                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3182                        pi2_dst_scratch += out_stride;
3183                    }
3184
3185                    /* o7[0-3] */
3186                    {
3187                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3188                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3189                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3190                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3191
3192                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3193                        pi2_src_scratch += 8;
3194
3195                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3196                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3197                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3198                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3199                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3200
3201
3202                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3203                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3204                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3205                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3206
3207                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3208
3209                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3210                        pi2_dst_scratch += (i * out_stride + 8);
3211                    }
3212
3213                }
3214            }
3215        }
3216    }
3217
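    /* Transpose the intermediate data in pi2_tmp, add the prediction and store the result to pu1_dst with unsigned 8-bit saturation */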
3219    {
3220        WORD16 *pi2_src_scratch;
3221        UWORD8 *pu1_pred_temp = pu1_pred;
3222        WORD32 out_stride = dst_strd;
3223        WORD32 in_stride = trans_size;
3224        WORD32 j;
3225        m_temp_reg_1 = _mm_setzero_si128();
3226        for(i = 0; i < 2; i++)
3227        {
3228            pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
3229
3230            for(j = 0; j < 2; j++)
3231            {
3232                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
3233                pi2_src_scratch += in_stride;
3234                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
3235                pi2_src_scratch += ((!i) * in_stride + 8);
3236                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
3237                pi2_src_scratch += (in_stride);
3238                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
3239                pi2_src_scratch += (i * in_stride + 8);
3240                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
3241                pi2_src_scratch += in_stride;
3242                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
3243                pi2_src_scratch += ((!i) * in_stride + 8);
3244                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
3245                pi2_src_scratch += in_stride;
3246                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
3247                pi2_src_scratch += (i * in_stride + 8);
3248
3249                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
3250                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
3251
3252                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
3253                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
3254
3255                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
3256                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
3257
3258                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
3259                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
3260
3261
3262                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
3263                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
3264
3265                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
3266                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
3267
3268                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
3269                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
3270
3271                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
3272                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
3273
3274
3275                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
3276                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3277
3278                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3279                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3280
3281                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
3282                m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
3283                m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
3284
3285                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
3286                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3287                pu1_dst += out_stride;
3288                pu1_pred_temp += pred_strd;
3289
3290                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
3291                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3292
3293                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3294                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3295
                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
3297                m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
3298                m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
3299
3300                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
3301                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3302                pu1_dst += out_stride;
3303                pu1_pred_temp += pred_strd;
3304
3305                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
3306                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3307
3308                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3309                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3310
                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
3312                m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
3313                m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
3314
3315                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
3316                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3317                pu1_dst += out_stride;
3318                pu1_pred_temp += pred_strd;
3319
3320                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
3321                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3322
3323                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3324                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3325
                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
3327                m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
3328                m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
3329
3330                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
3331                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3332                pu1_dst += out_stride;
3333                pu1_pred_temp += pred_strd;
3334            }
3335        }
3336    }
3337}
3338