1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19 *******************************************************************************
20 * @file
21 *  ihevc_iquant_itrans_recon_atom_intr.c
22 *
23 * @brief
24 *  Contains function definitions for inverse  quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 *  100470
29 *  100592 (edited by)
30 *
31 * @par List of Functions:
32 *  - ihevc_itrans_recon_16x16_ssse3()
33 *
34 * @remarks
35 *  None
36 *
37 *******************************************************************************
38 */
39#include <stdio.h>
40#include <string.h>
41#include "ihevc_typedefs.h"
42#include "ihevc_macros.h"
43#include "ihevc_platform_macros.h"
44#include "ihevc_defs.h"
45#include "ihevc_trans_tables.h"
46#include "ihevc_itrans_recon.h"
47#include "ihevc_func_selector.h"
48#include "ihevc_trans_macros.h"
49
50
51
52#include <immintrin.h>
53#include <emmintrin.h>
54
55#include <tmmintrin.h>
56
57
58/**
59 *******************************************************************************
60 *
61 * @brief
62 *  This function performs inverse transform and reconstruction for a
63 * 16x16 input block
64 *
65 * @par Description:
66 *  Performs the inverse transform, adds the prediction data and clips
67 * the output to 8 bit
68 *
69 * @param[in] pi2_src
70 *  Input 16x16 coefficients
71 *
72 * @param[in] pi2_tmp
73 *  Temporary 16x16 buffer for storing inverse
74 *  transform 1st stage output
75 *
76 * @param[in] pu1_pred
77 *  Prediction 16x16 block
78 *
82 * @param[out] pu1_dst
83 *  Output 16x16 block
84 *
91 * @param[in] src_strd
92 *  Input stride
93 *
94 * @param[in] pred_strd
95 *  Prediction stride
96 *
97 * @param[in] dst_strd
98 *  Output Stride
99 *
100 * @param[in] zero_cols, zero_rows
101 *  Bit masks of zero columns / zero rows in pi2_src (tested against 0xFF00 and 0xFFF0)
102 *
103 * @returns  Void
104 *
105 * @remarks
106 *  None
107 *
108 *******************************************************************************
109 */
110
111void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
112                                    WORD16 *pi2_tmp,
113                                    UWORD8 *pu1_pred,
114                                    UWORD8 *pu1_dst,
115                                    WORD32 src_strd,
116                                    WORD32 pred_strd,
117                                    WORD32 dst_strd,
118                                    WORD32 zero_cols,
119                                    WORD32 zero_rows)
120{
121    __m128i m_temp_reg_0;
122    __m128i m_temp_reg_1;
123    __m128i m_temp_reg_10;
124    __m128i m_temp_reg_11;
125    __m128i m_temp_reg_12;
126    __m128i m_temp_reg_13;
127    __m128i m_temp_reg_14;
128
129    __m128i m_temp_reg_20;
130    __m128i m_temp_reg_21;
131    __m128i m_temp_reg_22;
132    __m128i m_temp_reg_23;
133    __m128i m_temp_reg_24;
134    __m128i m_temp_reg_25;
135    __m128i m_temp_reg_26;
136    __m128i m_temp_reg_27;
137    __m128i m_temp_reg_30;
138    __m128i m_temp_reg_31;
139    __m128i m_temp_reg_32;
140    __m128i m_temp_reg_33;
141    __m128i m_temp_reg_34;
142    __m128i m_temp_reg_35;
143    __m128i m_temp_reg_36;
144    __m128i m_temp_reg_37;
145    __m128i m_temp_reg_40;
146    __m128i m_temp_reg_41;
147    __m128i m_temp_reg_42;
148    __m128i m_temp_reg_43;
149    __m128i m_temp_reg_44;
150    __m128i m_temp_reg_45;
151    __m128i m_temp_reg_46;
152    __m128i m_temp_reg_47;
153
154    __m128i m_temp_reg_70;
155    __m128i m_temp_reg_71;
156    __m128i m_temp_reg_72;
157    __m128i m_temp_reg_73;
158    __m128i m_temp_reg_74;
159    __m128i m_temp_reg_75;
160    __m128i m_temp_reg_76;
161    __m128i m_temp_reg_77;
162    __m128i m_rdng_factor;
163    __m128i m_count;
164    __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
165    __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
166
167    WORD32 i;
168/*Lokesh*/
169    WORD32  zero_last8_cols_stg1;
170    WORD32  zero_last8_rows_stg1;
171    WORD32  zero_last12_rows_stg1;
172    WORD32  zero_last12_rows_stg2;
173    WORD32  zero_last8_rows_stg2;
174
175    WORD32  loop = 0;
176
177    WORD32 i4_shift = IT_SHIFT_STAGE_1;
178    WORD32 trans_size = TRANS_SIZE_16;
179
180
181
182
183    /* Following 3 instructions replicates the value in the */
184    /* lower 16 bits of m_add_iq in the entire register */
185
186    /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
187
188    zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
189    zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
190    zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
191
192    zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
193    zero_last8_rows_stg2 = zero_last8_cols_stg1;
194    if(zero_last8_cols_stg1)
195    {
196        loop = 1;
197    }
198    else
199        loop = 2;
200
201    /* i = 0 => lower 8 samples */
202    /* i = 1 => higher 8 samples */
203    for(i = 0; i < loop; i++)
204    {
205        {
206            WORD32 sample_half_index = i << 3;
207            WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
208            WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
209
210            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
211            pi2_tmp_src += (src_strd << 1);
212            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
213            pi2_tmp_src += (src_strd << 1);
214            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
215            pi2_tmp_src += (src_strd << 1);
216            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
217            pi2_tmp_src += (src_strd << 1);
218            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
219            pi2_tmp_src += (src_strd << 1);
220            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
221            pi2_tmp_src += (src_strd << 1);
222            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
223            pi2_tmp_src += (src_strd << 1);
224            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
225            pi2_tmp_src += (src_strd << 1);
226
227
228
229
230            /* If last 12 rows are zero : Rishab */
231            if(zero_last12_rows_stg1)
232            {
233
234                /* eee */
235                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
236                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
237                {
238                    /* Loading coeff and src for use in next block */
239
240                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
241                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
242
243                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
244
245                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
246
247                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
248
249                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
250
251                    m_temp_reg_26 = m_temp_reg_24;
252                    m_temp_reg_27 = m_temp_reg_25;
253                }
254
255                /* eo */
256
257                /* eo0[0-3] */
258                {
259                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
260                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
261
262                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
263
264                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
265
266                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
267                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
268                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
269                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
270
271                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
272                    pi2_scratch += 8;
273                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
274                    pi2_scratch += 8;
275
276                }
277
278
279                /* eo0[4-7] */
280                {
281                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
282
283                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
284
285                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
286                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
287                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
288                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
289
290                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
291                    pi2_scratch += 8;
292                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
293                    pi2_scratch += 8;
294
295                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
296                }
297
298                /* eo1[0-3] */
299                {
300                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
301
302                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
303
304                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
305                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
306                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
307                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
308
309                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
310                    pi2_scratch += 8;
311                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
312                    pi2_scratch += 8;
313                }
314
315                /* eo1[4-7] */
316                {
317                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
318
319                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
320
321                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
322                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
323                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
324                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
325
326                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
327                    pi2_scratch += 8;
328                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
329                    pi2_scratch += 8;
330
331                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
332
333                }
334
335                /* eo2[0-3] */
336                {
337                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
338
339                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
340                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
341                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
342                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
343
344                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
345                    pi2_scratch += 8;
346                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
347                    pi2_scratch += 8;
348
349                }
350
351                /* eo2[4-7] */
352                {
353                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
354
355                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
356                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
357                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
358                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
359
360
361                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
362                    pi2_scratch += 8;
363                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
364                    pi2_scratch += 8;
365
366                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
367                }
368
369                /* eo3[0-3] */
370                {
371                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
372
373                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
374                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
375                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
376                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
377
378                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
379                    pi2_scratch += 8;
380                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
381                    pi2_scratch += 8;
382                }
383
384                /* eo3[4-7] */
385                {
386                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
387
388                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
389                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
390                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
391                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
392
393                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
394                    pi2_scratch += 8;
395                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
396                    pi2_scratch += 8;
397                }
398            }
399            /* If last 8 rows are zero : Rishab */
400            else if(zero_last8_rows_stg1)
401            {
402                /* eeo */
403                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
404                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
405                {
406                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
407                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
408
409                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
410                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
411
412                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
413                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
414
415                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
416                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
417
418                }
419
420                /* eee */
421                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
422                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
423                {
424                    /* Loading coeff and src for use in next block */
425                    m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to  get signs
426                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
427
428                    m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
429
430                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
431
432                    m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
433
434                    m_temp_reg_26 = m_temp_reg_24;
435                    m_temp_reg_27 = m_temp_reg_25;
436
437                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
438                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
439                }
440
441                /* eo0[0-3] */
442                {
443                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
444                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
445
446                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
447
448                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
449                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
450                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
451
452                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
453                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
454                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
455                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
456
457                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
458                    pi2_scratch += 8;
459                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
460                    pi2_scratch += 8;
461
462                }
463
464                /* eo0[4-7] */
465                {
466                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
467
468                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
469                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
470                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
471
472                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
473                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
474                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
475                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
476
477                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
478                    pi2_scratch += 8;
479                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
480                    pi2_scratch += 8;
481
482                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
483                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
484
485                }
486
487                /* eo1[0-3] */
488                {
489                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
490
491                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
492                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
493                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
494
495                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
496                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
497                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
498                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
499
500                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
501                    pi2_scratch += 8;
502                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
503                    pi2_scratch += 8;
504
505                }
506
507                /* eo1[4-7] */
508                {
509                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
510
511                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
512                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
513                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
514
515                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
516                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
517                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
518                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
519
520                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
521                    pi2_scratch += 8;
522                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
523                    pi2_scratch += 8;
524
525                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
526                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
527
528                }
529
530                /* eo2[0-3] */
531                {
532                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
533
534                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
535                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
536                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
537                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
538
539                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
540                    pi2_scratch += 8;
541                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
542                    pi2_scratch += 8;
543
544                }
545
546                /* eo2[4-7] */
547                {
548                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
549
550                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
551                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
552                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
553                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
554
555                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
556                    pi2_scratch += 8;
557                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
558                    pi2_scratch += 8;
559
560                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
561                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
562
563                }
564
565                /* eo3[0-3] */
566                {
567                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
568
569                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
570                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
571                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
572                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
573
574                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
575                    pi2_scratch += 8;
576                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
577                    pi2_scratch += 8;
578                }
579
580                /* eo3[4-7] */
581                {
582                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
583
584                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
585                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
586                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
587                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
588
589                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
590                    pi2_scratch += 8;
591                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
592                    pi2_scratch += 8;
593                }
594            } /* If all the rows are non-zero : Rishab */
595            else
596            {
597                /* eeo */
598                /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
599                /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
600
601                {
602                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
603                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
604
605                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
606                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
607
608                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
609                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
610
611                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
612                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
613                }
614
615                /* eee */
616                /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
617                /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
618                {
619                    /* Loading coeff and src for use in next block */
620                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
621                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
622
623                    m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
624                    m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
625
626                    m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
627                    m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
628
629                    m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
630                    m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
631
632                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
633                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
634
635                }
636                /* eo0[0-3] */
637                {
638                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
639                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
640                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
641                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
642
643                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
644                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
645
646
647                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
648                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
649                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
650
651                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
652                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
653                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
654                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
655                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
656                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
657
658                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
659                    pi2_scratch += 8;
660                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
661                    pi2_scratch += 8;
662
663
664                }
665
666                /* eo0[4-7] */
667                {
668                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
669                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
670
671                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
672                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
673                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
674
675                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
676                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
677                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
678                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
679                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
680                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
681
682                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
683                    pi2_scratch += 8;
684                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
685                    pi2_scratch += 8;
686
687                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
688                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
689
690                }
691
692                /* eo1[0-3] */
693                {
694                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
695                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
696
697                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
698                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
699                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
700
701                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
702                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
703                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
704                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
705                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
706                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
707
708                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
709                    pi2_scratch += 8;
710                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
711                    pi2_scratch += 8;
712
713                }
714
715                /* eo1[4-7] */
716                {
717                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
718                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
719
720                    /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
721                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
722                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
723
724                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
725                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
726                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
727                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
728                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
729                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
730
731                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
732                    pi2_scratch += 8;
733                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
734                    pi2_scratch += 8;
735                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
736                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
737                }
738
739                /* eo2[0-3] */
740                {
741                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
742                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
743
744                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
745                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
746                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
747                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
748                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
749                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
750
751                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
752                    pi2_scratch += 8;
753                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
754                    pi2_scratch += 8;
755                }
756
757                /* eo2[4-7] */
758                {
759                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
760                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
761
762                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
763                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
764                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
765                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
766                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
767                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
768
769                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
770                    pi2_scratch += 8;
771                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
772                    pi2_scratch += 8;
773
774                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
775                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
776
777                }
778
779                /* eo3[0-3] */
780                {
781                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
782                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
783
784                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
785                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
786                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
787                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
788                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
789                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
790
791
792                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
793                    pi2_scratch += 8;
794                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
795                    pi2_scratch += 8;
796                }
797
798                /* eo3[4-7] */
799                {
800                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
801                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
802
803                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
804                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
805                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
806                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
807                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
808                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
809
810                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
811                    pi2_scratch += 8;
812                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
813                    pi2_scratch += 8;
814                }
815
816            }
817        }
818
819        {
820            WORD32 sample_half_index = i << 3;
821            WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
822
823            m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
824            pi2_tmp_src += (src_strd << 1);
825            m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
826            pi2_tmp_src += (src_strd << 1);
827            m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
828            pi2_tmp_src += (src_strd << 1);
829            m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
830            pi2_tmp_src += (src_strd << 1);
831            m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
832            pi2_tmp_src += (src_strd << 1);
833            m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
834            pi2_tmp_src += (src_strd << 1);
835            m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
836            pi2_tmp_src += (src_strd << 1);
837            m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
838            pi2_tmp_src += (src_strd << 1);
839        }
840
841        /* o & stage 1 out */
842        {
843            WORD32 j;
844            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
845            WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
846            WORD32 out_stride = (trans_size << 1);
847            WORD32 in_stride = trans_size << 1;
848
849            if(zero_last12_rows_stg1)
850            {
851                for(j = 0; j < 2; j++)
852                {
853                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
854                    {
855                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
856                    }
857                    else
858                    {
859                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
860                    }
861                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
862
863
864                    /* o0[0-3] */
865                    {
866                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
867
868
869                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
870                        pi2_src_scratch += in_stride;
871
872                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
873
874
875                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
876                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
877
878                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
879                        m_count = _mm_cvtsi32_si128(i4_shift);
880                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
881
882                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
883                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
884                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
885                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
886
887                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
888
889                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
890                        pi2_dst_scratch += out_stride;
891                    }
892
893                    /* o1[0-3] */
894                    {
895                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
896
897
898                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
899                        pi2_src_scratch += in_stride;
900
901                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
902
903                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
904                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
905
906                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
907                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
908                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
909                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
910
911                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
912
913                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
914                        pi2_dst_scratch += out_stride;
915                    }
916
917                    /* o2[0-3] */
918                    {
919                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
920
921                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
922                        pi2_src_scratch += in_stride;
923
924                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
925
926                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
927                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
928
929                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
930                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
931                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
932                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
933
934                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
935
936                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
937                        pi2_dst_scratch += out_stride;
938                    }
939
940                    /* o3[0-3] */
941                    {
942                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
943
944                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
945                        pi2_src_scratch += 8;
946
947                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
948
949                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
950                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
951
952                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
953                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
954                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
955                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
956
957                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
958
959                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
960                        pi2_dst_scratch += 8;
961                    }
962
963                    /* o4[0-3] */
964                    {
965                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
966
967                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
968                        pi2_src_scratch -= in_stride;
969
970                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
971
972                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
973                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
974
975                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
976                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
977                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
978                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
979
980                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
981
982                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
983                        pi2_dst_scratch -= out_stride;
984                    }
985
986                    /* o5[0-3] */
987                    {
988                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
989
990
991                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
992                        pi2_src_scratch -= in_stride;
993
994                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
995
996                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
997                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
998
999                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1000                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1001                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1002                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1003
1004                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1005
1006                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1007                        pi2_dst_scratch -= out_stride;
1008                    }
1009
1010                    /* o6[0-3] */
1011                    {
1012                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1013
1014                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1015                        pi2_src_scratch -= in_stride;
1016
1017                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1018
1019                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1020                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1021
1022                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1023                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1024                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1025                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1026
1027                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1028
1029                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1030                        pi2_dst_scratch -= out_stride;
1031                    }
1032
1033                    /* o7[0-3] */
1034                    {
1035                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1036
1037                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1038                        pi2_src_scratch += 8;
1039
1040                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1041                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1042
1043                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1044                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1045                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1046                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1047
1048                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1049
1050                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1051                        pi2_dst_scratch += 8;
1052                    }
1053                }
1054            }
1055            else if(zero_last8_rows_stg1)
1056            {
1057                for(j = 0; j < 2; j++)
1058                {
1059                    if(j)
1060                    {
1061                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1062                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1063                    }
1064                    else
1065                    {
1066                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1067                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1068                    }
1069                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1070                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1071
1072                    /* o0[0-3] */
1073                    {
1074                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1075                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1076
1077                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1078                        pi2_src_scratch += in_stride;
1079
1080                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1081                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1082
1083                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1084                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1085                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1086
1087
1088                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1089                        m_count = _mm_cvtsi32_si128(i4_shift);
1090
1091                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
1092
1093                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1094                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1095                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1096                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1097
1098                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1099
1100                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1101                        pi2_dst_scratch += out_stride;
1102                    }
1103
1104                    /* o1[0-3] */
1105                    {
1106                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1107                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1108
1109                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1110                        pi2_src_scratch += in_stride;
1111
1112                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1113                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1114
1115                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1116                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1117                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1118
1119                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1120                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1121                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1122                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1123
1124                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1125
1126                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1127                        pi2_dst_scratch += out_stride;
1128                    }
1129
1130                    /* o2[0-3] */
1131                    {
1132                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1133                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1134
1135                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1136                        pi2_src_scratch += in_stride;
1137
1138                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1139                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1140
1141                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1142                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1143                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1144
1145                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1146                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1147                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1148                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1149
1150                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1151
1152                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1153                        pi2_dst_scratch += out_stride;
1154                    }
1155
1156                    /* o3[0-3] */
1157                    {
1158                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1159                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1160
1161                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1162                        pi2_src_scratch += 8;
1163
1164                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1165                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1166
1167                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1168                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1169                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1170
1171                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1172                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1173                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1174                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1175
1176                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1177
1178                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1179                        pi2_dst_scratch += 8;
1180                    }
1181
1182                    /* o4[0-3] */
1183                    {
1184                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1185                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1186
1187                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1188                        pi2_src_scratch -= in_stride;
1189
1190                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1191                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1192
1193                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1194                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1195                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1196
1197
1198                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1199                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1200                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1201                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1202
1203                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1204
1205                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1206                        pi2_dst_scratch -= out_stride;
1207                    }
1208
1209                    /* o5[0-3] */
1210                    {
1211                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1212                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1213
1214                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1215                        pi2_src_scratch -= in_stride;
1216
1217                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1218                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1219
1220                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1221                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1222                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1223
1224
1225                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1226                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1227                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1228                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1229
1230                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1231
1232                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1233                        pi2_dst_scratch -= out_stride;
1234                    }
1235
1236                    /* o6[0-3] */
1237                    {
1238                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1239                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1240
1241                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1242                        pi2_src_scratch -= in_stride;
1243
1244                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1245                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1246
1247                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1248                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1249                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1250
1251                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1252                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1253                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1254                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1255
1256                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1257
1258                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1259                        pi2_dst_scratch -= out_stride;
1260                    }
1261
1262                    /* o7[0-3] */
1263                    {
1264                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1265                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1266
1267                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1268                        pi2_src_scratch += 8;
1269
1270                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1271                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1272                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1273
1274                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1275                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1276                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1277                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1278
1279                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1280
1281                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1282                        pi2_dst_scratch += 8;
1283                    }
1284                }
1285
1286            }
1287            else
1288            {
1289
1290
1291
1292                for(j = 0; j < 2; j++)
1293                {
1294                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
1295                    {
1296                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1297                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1298                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
1299                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
1300                    }
1301                    else
1302                    {
1303                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1304                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1305                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
1306                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
1307                    }
1308                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1309                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1310                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
1311                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
1312
1313
1314                    /* o0[0-3] */
1315                    {
1316                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1317                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1318                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1319                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1320
1321
1322                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1323                        pi2_src_scratch += in_stride;
1324
1325                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1326                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1327                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
1328                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
1329
1330                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1331                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
1332                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1333                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1334                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1335
1336                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1337                        m_count = _mm_cvtsi32_si128(i4_shift);
1338                        m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1339                        m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1340
1341                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1342                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1343                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1344                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1345
1346                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1347
1348                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1349                        pi2_dst_scratch += out_stride;
1350                    }
1351
1352                    /* o1[0-3] */
1353                    {
1354                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1355                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1356                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1357                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1358
1359
1360                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1361                        pi2_src_scratch += in_stride;
1362
1363                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1364                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1365                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
1366                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
1367
1368                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1369                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1370                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1371                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1372                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1373
1374                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1375                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1376                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1377                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1378
1379                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1380
1381                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1382                        pi2_dst_scratch += out_stride;
1383                    }
1384
1385                    /* o2[0-3] */
1386                    {
1387                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1388                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1389                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1390                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1391
1392                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1393                        pi2_src_scratch += in_stride;
1394
1395                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1396                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1397                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
1398                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
1399
1400                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1401                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1402                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1403                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1404                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1405
1406                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1407                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1408                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1409                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1410
1411                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1412
1413                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1414                        pi2_dst_scratch += out_stride;
1415                    }
1416
1417                    /* o3[0-3] */
1418                    {
1419                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1420                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1421                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1422                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1423
1424
1425                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1426                        pi2_src_scratch += 8;
1427
1428                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1429                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1430                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
1431                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
1432
1433                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1434                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1435                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1436                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1437                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1438
1439                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1440                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1441                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1442                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1443
1444                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1445
1446                        _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1447                        pi2_dst_scratch += 8;
1448                    }
1449
1450                    /* o4[0-3] */
1451                    {
1452                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1453                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1454                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1455                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1456
1457                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1458                        pi2_src_scratch -= in_stride;
1459
1460                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1461                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1462                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
1463                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
1464
1465                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1466                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1467                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1468                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1469                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1470
1471                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1472                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1473                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1474                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1475
1476                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1477
1478                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1479                        pi2_dst_scratch -= out_stride;
1480                    }
1481
1482                    /* o5[0-3] */
1483                    {
1484                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1485                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1486                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1487                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1488
1489
1490                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1491                        pi2_src_scratch -= in_stride;
1492
1493                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1494                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1495                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
1496                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
1497
1498                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1499                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1500                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1501                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1502                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1503
1504                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1505                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1506                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1507                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1508
1509                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1510
1511                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1512                        pi2_dst_scratch -= out_stride;
1513                    }
1514
1515                    /* o6[0-3] */
1516                    {
1517                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1518                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1519                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1520                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1521
1522
1523                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1524                        pi2_src_scratch -= in_stride;
1525
1526                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1527                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1528                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
1529                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
1530
1531
1532                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1533                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1534                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1535                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1536                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1537
1538                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1539                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1540                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1541                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1542
1543                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1544
1545                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1546                        pi2_dst_scratch -= out_stride;
1547                    }
1548
1549                    /* o7[0-3] */
1550                    {
1551                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1552                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1553                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1554                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1555
1556                        m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1557                        pi2_src_scratch += 8;
1558
1559                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1560                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1561                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1562                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1563                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1564
1565
1566                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1567                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1568                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1569                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1570
1571                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1572
1573                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1574                        pi2_dst_scratch += 8;
1575                    }
1576                }
1577            }
1578        }
1579
1580        /* Transpose */
1581        {
1582            WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
1583            WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
1584            WORD32 out_stride = (trans_size << 1);
1585            WORD32 in_stride = (trans_size << 1);
1586            WORD32 j;
1587
1588            for(j = 0; j < 2; j++)
1589            {
1590                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
1591                pi2_src_scratch += in_stride;
1592                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
1593                pi2_src_scratch += in_stride;
1594                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
1595                pi2_src_scratch += in_stride;
1596                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
1597                pi2_src_scratch += 8;
1598                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
1599                pi2_src_scratch -= in_stride;
1600                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
1601                pi2_src_scratch -= in_stride;
1602                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
1603                pi2_src_scratch -= in_stride;
1604                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
1605                pi2_src_scratch += 8;
1606
1607                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
1608                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
1609
1610                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
1611                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
1612
1613                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
1614                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
1615
1616                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
1617                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
1618
1619
1620                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
1621                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
1622
1623                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
1624                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
1625
1626                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
1627                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
1628
1629                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
1630                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
1631
1632
1633                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
1634                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
1635
1636                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
1637                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
1638
1639                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
1640                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
1641
1642                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
1643                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
1644
1645                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1646                pi2_dst_scratch += out_stride;
1647                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
1648                pi2_dst_scratch += out_stride;
1649                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
1650                pi2_dst_scratch += out_stride;
1651                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
1652                pi2_dst_scratch += 8;
1653                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
1654                pi2_dst_scratch -= out_stride;
1655                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
1656                pi2_dst_scratch -= out_stride;
1657                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
1658                pi2_dst_scratch -= out_stride;
1659                _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
1660                pi2_dst_scratch += 8;
1661            }
1662        }
1663    }
1664
1665    if(zero_last8_cols_stg1)
1666    {
1667        WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
1668        WORD32 out_stride = (trans_size << 1);
1669        WORD32 j;
1670
1671        m_temp_reg_40 = _mm_setzero_si128();
1672        for(j = 0; j < 2; j++)
1673        {
1674            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1675            pi2_dst_scratch += out_stride;
1676            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1677            pi2_dst_scratch += out_stride;
1678            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1679            pi2_dst_scratch += out_stride;
1680            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1681            pi2_dst_scratch += 8;
1682            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1683            pi2_dst_scratch -= out_stride;
1684            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1685            pi2_dst_scratch -= out_stride;
1686            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1687            pi2_dst_scratch -= out_stride;
1688            _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1689            pi2_dst_scratch += 8;
1690        }
1691    }
1692
1693
1694
1695
1696    /* Stage 2 */
1697    for(i = 0; i < 2; i++)
1698    {
1699        WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
1700        WORD32 stride = (trans_size);
1701        MEM_ALIGN16 WORD16 temp_array[256];
1702
1703        i4_shift = IT_SHIFT_STAGE_2;
1704
1705        if(zero_last12_rows_stg2)
1706        {
1707            /* eeo */
1708            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1709            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1710            {
1711                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1712
1713                pi2_src_temp += (stride * 9);
1714
1715                if(!i)
1716                {
1717                    pi2_src_temp += (stride * 6 + 8);
1718                }
1719                else
1720                {
1721                    pi2_src_temp += (stride * 2 + 8);
1722                }
1723
1724                pi2_src_temp -= (stride * 9);
1725
1726                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1727
1728                m_temp_reg_20 = _mm_setzero_si128();
1729                m_temp_reg_22 = _mm_setzero_si128();
1730
1731                m_temp_reg_21 = _mm_setzero_si128();
1732                m_temp_reg_23 = _mm_setzero_si128();
1733            }
1734
1735            /* eee */
1736            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1737            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1738            {
                /* Sign-extend row 0 to 32 bits and scale by 64 (<< 6) to form eee; */
                /* also loads coeffs and interleaves row 2 with zero for use in the next block */
1742                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
1743
1744                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1745
1746                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1747
1748                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1749                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1750
1751                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1752                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
1753
1754                m_temp_reg_26 = m_temp_reg_24;
1755                m_temp_reg_27 = m_temp_reg_25;
1756                /*  */
1757
1758                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
1759                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
1760            }
1761
1762            /* eo */
1763            {
1764                WORD16 *pi2_scratch = temp_array;
1765                WORD32 out_stride = 8;
1766
1767
1768                /* eo0[0-3] */
1769                {
1770                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1771
1772                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1773
1774                    /* e[0][0-3] stored in pu1_dst[0] */
1775                    /* e[7][0-3] stored in pu1_dst[1] */
1776                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1777                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1778
1779                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1780                    pi2_scratch += out_stride;
1781                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1782                    pi2_scratch += out_stride;
1783                }
1784
1785                /* eo0[4-7] */
1786                {
1787                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1788
1789                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1790
1791                    /* e[0][4-7] stored in pu1_dst[2] */
1792                    /* e[7][4-7] stored in pu1_dst[3] */
1793
1794                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1795                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1796
1797                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1798                    pi2_scratch += out_stride;
1799                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1800                    pi2_scratch += out_stride;
1801
1802                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
1803                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
1804
1805                }
1806
1807                /* eo1[0-3] */
1808                {
1809                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1810
                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
1812
1813                    /* e[1][0-3] stored in pu1_dst[4] */
1814                    /* e[6][0-3] stored in pu1_dst[5] */
1815                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1816                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1817
1818                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1819                    pi2_scratch += out_stride;
1820                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1821                    pi2_scratch += out_stride;
1822                }
1823
1824                /* eo1[4-7] */
1825                {
1826                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1827
                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
1829
1830                    /* e[1][4-7] stored in pu1_dst[6]*/
1831                    /* e[6][4-7] stored in pu1_dst[7] */
1832                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1833                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1834
1835                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1836                    pi2_scratch += out_stride;
1837                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1838                    pi2_scratch += out_stride;
1839
1840                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
1841
1842                }
1843
1844                /* eo2[0-3] */
1845                {
1846                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1847
1848                    /* e[2][0-3] stored in pu1_dst[8]*/
1849                    /* e[5][0-3] stored in pu1_dst[9] */
1850                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1851                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1852
1853                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1854                    pi2_scratch += out_stride;
1855                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1856                    pi2_scratch += out_stride;
1857                }
1858
1859                /* eo2[4-7] */
1860                {
1861                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1862
1863                    /* e[2][4-7] stored in pu1_dst[10]*/
1864                    /* e[5][4-7] stored in pu1_dst[11] */
1865                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1866                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1867
1868                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1869                    pi2_scratch += out_stride;
1870                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1871                    pi2_scratch += out_stride;
1872
1873                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
1874                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
1875
1876                }
1877
1878                /* eo3[0-3] */
1879                {
1880                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1881
1882                    /* e[3][0-3] stored in pu1_dst[12]*/
1883                    /* e[4][0-3] stored in pu1_dst[13] */
1884                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1885                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1886
1887                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1888                    pi2_scratch += out_stride;
1889                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1890                    pi2_scratch += out_stride;
1891                }
1892
1893                /* eo3[4-7] */
1894                {
1895                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1896
1897                    /* e[3][4-7] stored in pu1_dst[14]*/
1898                    /* e[4][4-7] stored in pu1_dst[15] */
1899                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1900                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1901
1902                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1903                    pi2_scratch += out_stride;
1904                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1905                    pi2_scratch += out_stride;
1906                }
1907
1908            }
1909        }
1910        else if(zero_last8_rows_stg2)
1911        {
1912            /* eeo */
1913            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1914            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1915            {
1916
1917                m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
1918                m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
1919
1920                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1921                pi2_src_temp += (stride);
1922                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
1923                pi2_src_temp += (stride * 8);
1924
1925                if(!i)
1926                {
1927                    pi2_src_temp += (stride * 6 + 8);
1928                }
1929                else
1930                {
1931                    pi2_src_temp += (stride * 2 + 8);
1932                }
1933
1934                pi2_src_temp -= (stride * 8);
1935                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
1936                pi2_src_temp -= (stride);
1937                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1938
1939
1940                m_temp_reg_76 = _mm_setzero_si128();
1941
1942
1943                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
1944                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
1945
1946                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
1947                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
1948
1949                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1950                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1951
1952                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1953                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1954            }
1955
1956            /* eee */
1957            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1958            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1959            {
1960                /* Loading coeff and src for use in next block */
1961
1962
1963                m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
1964
1965                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1966
1967                m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1968                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1969                m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1970
1971                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1972
1973                m_temp_reg_26 = m_temp_reg_24;
1974                m_temp_reg_27 = m_temp_reg_25;
1975
1976                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1977                m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1978            }
1979
1980            /* eo */
1981            {
1982                WORD16 *pi2_scratch = temp_array;
1983                WORD32 out_stride = 8;
1984
1985
1986                /* eo0[0-3] */
1987                {
1988                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1989
1990                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1991                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
1992                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
1993
1994                    /* e[0][0-3] stored in scratch slot 0 (temp_array + 0 * out_stride) */
1995                    /* e[7][0-3] stored in scratch slot 1 (temp_array + 1 * out_stride) */
1996                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1997                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1998
1999                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2000                    pi2_scratch += out_stride;
2001                    _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
2002                    pi2_scratch += out_stride;
2003                }
2004
2005                /* eo0[4-7] */
2006                {
2007                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2008
2009                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2010                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2011                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2012
2013                    /* e[0][4-7] stored in scratch slot 2 (temp_array + 2 * out_stride) */
2014                    /* e[7][4-7] stored in scratch slot 3 (temp_array + 3 * out_stride) */
2015
2016                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2017                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2018
2019                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2020                    pi2_scratch += out_stride;
2021                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2022                    pi2_scratch += out_stride;
2023
2024                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2025                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2026
2027                }
2028
2029                /* eo1[0-3] */
2030                {
2031                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2032
2033                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2034                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2035                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2036
2037                    /* e[1][0-3] stored in scratch slot 4 (temp_array + 4 * out_stride) */
2038                    /* e[6][0-3] stored in scratch slot 5 (temp_array + 5 * out_stride) */
2039                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2040                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2041
2042                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2043                    pi2_scratch += out_stride;
2044                    _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2045                    pi2_scratch += out_stride;
2046                }
2047
2048                /* eo1[4-7] */
2049                {
2050                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2051
2052                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2053                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2054                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2055
2056                    /* e[1][4-7] stored in scratch slot 6 (temp_array + 6 * out_stride) */
2057                    /* e[6][4-7] stored in scratch slot 7 (temp_array + 7 * out_stride) */
2058                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2059                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2060
2061                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2062                    pi2_scratch += out_stride;
2063                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2064                    pi2_scratch += out_stride;
2065
2066                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2067
2068                }
2069
2070                /* eo2[0-3] */
2071                {
2072                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2073
2074                    /* e[2][0-3] stored in scratch slot 8 (temp_array + 8 * out_stride) */
2075                    /* e[5][0-3] stored in scratch slot 9 (temp_array + 9 * out_stride) */
2076                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2077                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2078
2079                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2080                    pi2_scratch += out_stride;
2081                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2082                    pi2_scratch += out_stride;
2083                }
2084
2085                /* eo2[4-7] */
2086                {
2087                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2088
2089                    /* e[2][4-7] stored in scratch slot 10 (temp_array + 10 * out_stride) */
2090                    /* e[5][4-7] stored in scratch slot 11 (temp_array + 11 * out_stride) */
2091                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2092                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2093
2094                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2095                    pi2_scratch += out_stride;
2096                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2097                    pi2_scratch += out_stride;
2098
2099                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2100                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2101
2102                }
2103
2104                /* eo3[0-3] */
2105                {
2106                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2107
2108                    /* e[3][0-3] stored in scratch slot 12 (temp_array + 12 * out_stride) */
2109                    /* e[4][0-3] stored in scratch slot 13 (temp_array + 13 * out_stride) */
2110                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2111                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2112
2113                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2114                    pi2_scratch += out_stride;
2115                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2116                    pi2_scratch += out_stride;
2117                }
2118
2119                /* eo3[4-7] */
2120                {
2121                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2122
2123                    /* e[3][4-7] stored in scratch slot 14 (temp_array + 14 * out_stride) */
2124                    /* e[4][4-7] stored in scratch slot 15 (temp_array + 15 * out_stride) */
2125                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2126                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2127
2128                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2129                    pi2_scratch += out_stride;
2130                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2131                    pi2_scratch += out_stride;
2132                }
2133            }
2134        }
2135
2136        else
2137        {
2138            /* eeo */
2139            /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
2140            /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
2141            {
2142
2143
2144                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
2145                pi2_src_temp += (stride);
2146                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
2147                pi2_src_temp += (stride * 7);
2148                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
2149                pi2_src_temp += (stride);
2150                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
2151                if(!i)
2152                {
2153                    pi2_src_temp += (stride * 6 + 8);
2154                }
2155                else
2156                {
2157                    pi2_src_temp += (stride * 2 + 8);
2158                }
2159                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
2160                pi2_src_temp -= (stride);
2161                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
2162                pi2_src_temp -= (stride * 7);
2163                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
2164                pi2_src_temp -= (stride);
2165                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
2166
2167                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83  36
2168                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
2169
2170                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
2171                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
2172
2173                m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2174                m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2175
2176                m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2177                m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2178
2179
2180            }
2181
2182            /* eee */
2183            /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
2184            /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
2185            {
2186                /* Loading coeff and src for use in next block */
2187                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64  64
2188                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
2189
2190                m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
2191                m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
2192
2193                m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2194                m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
2195
2196                m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2197                m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
2198
2199                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
2200                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
2201
2202            }
2203
2204            /* eo */
2205            {
2206                WORD16 *pi2_scratch = temp_array;
2207                WORD32 out_stride = 8;
2208
2209
2210
2211                /* eo0[0-3] */
2212                {
2213                    m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2214                    m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2215                    m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
2216                    m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
2217
2218
2219                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2220                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2221
2222
2223                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2224                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
2225                    m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
2226
2227
2228                    /* e[0][0-3] stored in pi2_tmp[0][0-7] */
2229                    /* e[7][0-3] stored in pi2_tmp[0][8-15] */
2230                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2231                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2232                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2233                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2234
2235                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2236                    pi2_scratch += out_stride;
2237                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2238                    pi2_scratch += out_stride;
2239
2240
2241                }
2242
2243                /* eo0[4-7] */
2244                {
2245
2246                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2247                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2248
2249                    /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2250                    m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2251                    m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2252
2253                    /* e[0][4-7] stored in pi2_tmp[1][0-7] */
2254                    /* e[7][4-7] stored in pi2_tmp[1][8-15] */
2255                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2256                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2257                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2258                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2259
2260                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2261                    pi2_scratch += out_stride;
2262                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2263                    pi2_scratch += out_stride;
2264
2265                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2266                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2267
2268                }
2269
2270                /* eo1[0-3] */
2271                {
2272                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2273                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2274
2275                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2276                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2277                    m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2278
2279                    /* e[1][0-3] stored in pi2_tmp[2][0-7] */
2280                    /* e[6][0-3] stored in pi2_tmp[2][8-15] */
2281                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2282                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2283                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
2284                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
2285
2286                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2287                    pi2_scratch += out_stride;
2288                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2289                    pi2_scratch += out_stride;
2290
2291                }
2292
2293                /* eo1[4-7] */
2294                {
2295                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2296                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2297
2298                    /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2299                    m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2300                    m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2301
2302                    /* e[1][4-7] stored in pi2_tmp[3][0-7] */
2303                    /* e[6][4-7] stored in pi2_tmp[3][8-15] */
2304                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2305                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2306                    m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
2307                    m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
2308
2309                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2310                    pi2_scratch += out_stride;
2311                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2312                    pi2_scratch += out_stride;
2313                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2314                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
2315                }
2316
2317                /* eo2[0-3] */
2318                {
2319                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2320                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2321
2322                    /* e[2][0-3] stored in pi2_tmp[4][0-7] */
2323                    /* e[5][0-3] stored in pi2_tmp[4][8-15] */
2324                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2325                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2326                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2327                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2328
2329                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2330                    pi2_scratch += out_stride;
2331                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2332                    pi2_scratch += out_stride;
2333                }
2334
2335                /* eo2[4-7] */
2336                {
2337                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2338                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2339
2340                    /* e[2][4-7] stored in pi2_tmp[5][0-7] */
2341                    /* e[5][4-7] stored in pi2_tmp[5][8-15] */
2342                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2343                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2344                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2345                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2346
2347                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2348                    pi2_scratch += out_stride;
2349                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2350                    pi2_scratch += out_stride;
2351
2352                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2353                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2354
2355                }
2356
2357                /* eo3[0-3] */
2358                {
2359                    m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2360                    m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2361
2362                    /* e[3][0-3] stored in pi2_tmp[6][0-7] */
2363                    /* e[4][0-3] stored in pi2_tmp[6][8-15] */
2364                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2365                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2366                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2367                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2368
2369
2370                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2371                    pi2_scratch += out_stride;
2372                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2373                    pi2_scratch += out_stride;
2374                }
2375
2376                /* eo3[4-7] */
2377                {
2378                    m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2379                    m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2380
2381                    /* e[3][4-7] stored in pi2_tmp[7][0-7] */
2382                    /* e[4][4-7] stored in pi2_tmp[7][8-15] */
2383                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2384                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2385                    m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2386                    m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2387
2388                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2389                    pi2_scratch += out_stride;
2390                    _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2391                    pi2_scratch += out_stride;
2392                }
2393            }
2394        }
2395
2396        if(zero_last12_rows_stg2)
2397        {
2398            /* o & stage 2 pre-transposed out */
2399            {
2400                WORD32 j;
2401                WORD16 *pi2_src_scratch = temp_array;
2402                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2403                WORD32 out_stride = (trans_size);
2404                WORD32 in_stride = (8) * 4;
2405
2406                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2407
2408                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2409
2410                pi2_src_temp += (stride * 9);
2411
2412                if(0 == i)
2413                {
2414                    pi2_src_temp -= (stride * 2 - 8);
2415                }
2416                else
2417                {
2418                    pi2_src_temp -= (stride * 6 - 8);
2419                }
2420                pi2_src_temp -= (stride * 9);
2421
2422                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2423
2424
2425                for(j = 0; j < 2; j++)
2426                {
2427                    if(j)
2428                    {
2429                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2430                    }
2431                    else
2432                    {
2433                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2434                    }
2435                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2436
2437                    /* o0[0-3] */
2438                    {
2439                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2440
2441                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2442                        pi2_src_scratch += in_stride;
2443
2444                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2445
2446                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2447                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2448
2449
2450                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2451                        m_count = _mm_cvtsi32_si128(i4_shift);
2452                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2453
2454
2455                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2456                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2457                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2458                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2459
2460                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2461
2462                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2463                        pi2_dst_scratch += out_stride;
2464                    }
2465
2466                    /* o1[0-3] */
2467                    {
2468                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2469
2470                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2471                        pi2_src_scratch += in_stride;
2472
2473                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2474
2475                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2476                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2477
2478                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2479                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2480                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2481                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2482
2483                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2484
2485                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2486                        pi2_dst_scratch += ((!i) * out_stride + 8);
2487                    }
2488
2489                    /* o2[0-3] */
2490                    {
2491                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2492
2493                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2494                        pi2_src_scratch += in_stride;
2495
2496                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2497
2498                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2499                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2500
2501                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2502                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2503                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2504                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2505
2506                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2507
2508                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2509                        pi2_dst_scratch += out_stride;
2510                    }
2511
2512                    /* o3[0-3] */
2513                    {
2514                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2515
2516                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2517                        pi2_src_scratch += 8;
2518
2519                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2520
2521                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2522                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2523
2524                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2525                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2526                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2527                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2528
2529                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2530
2531                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2532                        pi2_dst_scratch += (i * out_stride + 8);
2533                    }
2534
2535                    /* o4[0-3] */
2536                    {
2537                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2538
2539                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2540                        pi2_src_scratch -= in_stride;
2541
2542                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2543
2544                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2545                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2546
2547
2548                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2549                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2550                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2551                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2552
2553                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2554
2555                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2556                        pi2_dst_scratch += out_stride;
2557                    }
2558
2559                    /* o5[0-3] */
2560                    {
2561                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2562
2563                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2564                        pi2_src_scratch -= in_stride;
2565
2566                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2567
2568                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2569                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2570
2571
2572                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2573                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2574                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2575                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2576
2577                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2578
2579                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2580                        pi2_dst_scratch += ((!i) * out_stride + 8);
2581                    }
2582
2583                    /* o6[0-3] */
2584                    {
2585                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2586
2587                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2588                        pi2_src_scratch -= in_stride;
2589
2590                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2591
2592                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2593                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2594
2595                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2596                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2597                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2598                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2599
2600                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2601
2602                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2603                        pi2_dst_scratch += out_stride;
2604                    }
2605
2606                    /* o7[0-3] */
2607                    {
2608                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2609
2610                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2611                        pi2_src_scratch += 8;
2612
2613                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2614                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2615
2616                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2617                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2618                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2619                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2620
2621                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2622
2623                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2624                        pi2_dst_scratch += (i * out_stride + 8);
2625                    }
2626
2627
2628                }
2629            }
2630        }
2631        else if(zero_last8_rows_stg2)
2632        {
2633            /* o & stage 2 pre-transposed out
                 *
                 * Branch taken when zero_last8_rows_stg2 is non-zero. Only four
                 * source rows (tagged 1, 5, 7 and 3 below) are loaded, so each of
                 * the terms o0..o7 is built from two madd products rather than the
                 * four used by the final else branch of this if/else chain.
                 * NOTE(review): presumably the rows tagged 9..15 are known to be
                 * zero on this path - confirm where zero_last8_rows_stg2 is set.
                 *
                 * For every o term the block:
                 *   - loads one 128-bit value from pi2_src_scratch (aligned load,
                 *     used as four 32-bit lanes),
                 *   - forms (loaded + o) and (loaded - o),
                 *   - adds the rounding term and shifts right arithmetically by
                 *     i4_shift,
                 *   - packs both results to 16 bit with signed saturation and
                 *     stores them (unaligned store) at pi2_dst_scratch.
                 */
2634            {
2635                WORD32 j;
2636                WORD16 *pi2_src_scratch = temp_array; /* read with aligned 128-bit loads */
2637                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); /* offset by 8 WORD16 when i is non-zero */
2638                WORD32 out_stride = (trans_size); /* destination step, in WORD16 units */
2639                WORD32 in_stride = (8) * 4; /* source step, in WORD16 units */
2640
2641                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2642
2643                /* Rows are loaded in the order 1, 5, 7, 3 (tags from the trailing comments) */
2644                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2645                pi2_src_temp += (stride);
2646                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2647                pi2_src_temp += (stride * 8);
2648                /* The += (stride * 8) above and the -= (stride * 8) further down cancel;
                     * the net step before the row-7 load is -(stride * 2 - 8) when i == 0
                     * and -(stride * 6 - 8) otherwise. */
2649                if(0 == i)
2650                {
2651                    pi2_src_temp -= (stride * 2 - 8);
2652                }
2653                else
2654                {
2655                    pi2_src_temp -= (stride * 6 - 8);
2656                }
2657
2658                pi2_src_temp -= (stride * 8);
2659                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2660                pi2_src_temp -= (stride);
2661                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2662
2663                /* j = 0 processes the low halves (unpacklo), j = 1 the high halves (unpackhi) */
2664                for(j = 0; j < 2; j++)
2665                {
2666                    if(j)
2667                    {
2668                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2669                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2670                    }
2671                    else
2672                    {
2673                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2674                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2675                    }
2676                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2677                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2678
2679                    /* o0[0-3] */
2680                    {
2681                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2682                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2683
2684                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2685                        pi2_src_scratch += in_stride;
2686                        /* coefficients for the next term (o1) are fetched here, ahead of use */
2687                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2688                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2689
2690                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2691                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2692                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2693
2694                        /* rounding term 1 << (i4_shift - 1) broadcast to all four lanes, and the shift count; set on every j iteration and reused by o1..o7 */
2695                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2696                        m_count = _mm_cvtsi32_si128(i4_shift);
2697
2698                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2699
2700                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2701                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2702                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2703                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2704                        /* saturating pack: 16-bit lanes 0-3 hold the sum, lanes 4-7 the difference */
2705                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2706
2707                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2708                        pi2_dst_scratch += out_stride;
2709                    }
2710
2711                    /* o1[0-3] */
2712                    {
2713                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2714                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2715
2716                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2717                        pi2_src_scratch += in_stride;
2718
2719                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2720                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
2721
2722                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2723                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2724                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2725
2726                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2727                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2728                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2729                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2730
2731                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2732
2733                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2734                        pi2_dst_scratch += ((!i) * out_stride + 8); /* step is out_stride + 8 when i == 0, otherwise 8 */
2735                    }
2736
2737                    /* o2[0-3] */
2738                    {
2739                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2740                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2741
2742                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2743                        pi2_src_scratch += in_stride;
2744
2745                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2746                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
2747                        /* the two products are subtracted for o2, o3 and o4, and added for o0, o1, o5, o6 and o7 */
2748                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2749                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2750                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2751
2752                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2753                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2754                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2755                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2756
2757                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2758
2759                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2760                        pi2_dst_scratch += out_stride;
2761                    }
2762
2763                    /* o3[0-3] */
2764                    {
2765                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2766                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2767
2768                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2769                        pi2_src_scratch += 8; /* step 8 WORD16 forward; o4..o6 then walk back by in_stride */
2770
2771                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2772                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
2773
2774                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
2775                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2776                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2777
2778                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2779                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2780                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2781                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2782
2783                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2784
2785                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2786                        pi2_dst_scratch += (i * out_stride + 8); /* step is 8 when i == 0, otherwise i * out_stride + 8 */
2787                    }
2788
2789                    /* o4[0-3] */
2790                    {
2791                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2792                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2793
2794                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2795                        pi2_src_scratch -= in_stride;
2796
2797                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2798                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
2799
2800                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2801                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2802                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2803
2804
2805                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2806                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2807                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2808                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2809
2810                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2811
2812                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2813                        pi2_dst_scratch += out_stride;
2814                    }
2815
2816                    /* o5[0-3] */
2817                    {
2818                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2819                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2820
2821                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2822                        pi2_src_scratch -= in_stride;
2823
2824                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2825                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
2826
2827                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2828                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2829                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2830
2831
2832                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2833                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2834                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2835                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2836
2837                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2838
2839                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2840                        pi2_dst_scratch += ((!i) * out_stride + 8);
2841                    }
2842
2843                    /* o6[0-3] */
2844                    {
2845                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2846                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2847
2848                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2849                        pi2_src_scratch -= in_stride;
2850
2851                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2852                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
2853
2854                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2855                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2856                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2857
2858                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2859                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2860                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2861                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2862
2863                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2864
2865                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2866                        pi2_dst_scratch += out_stride;
2867                    }
2868
2869                    /* o7[0-3] */
2870                    {
2871                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2872                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2873
2874                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2875                        pi2_src_scratch += 8; /* net advance over o0..o7 is 16 WORD16, where the j = 1 pass starts reading */
2876
2877                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2878                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2879                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2880
2881                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2882                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2883                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2884                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2885
2886                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2887
2888                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2889                        pi2_dst_scratch += (i * out_stride + 8);
2890                    }
2891                }
2892            }
2893        }
2894        else
2895        {
2896            /* o & stage 2 pre-transposed out */
2897            {
2898                WORD32 j;
2899                WORD16 *pi2_src_scratch = temp_array;
2900                WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2901                WORD32 out_stride = (trans_size);
2902                WORD32 in_stride = (8) * 4;
2903
2904                pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2905
2906
2907                m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2908                pi2_src_temp += (stride);
2909                m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2910                pi2_src_temp += (stride * 7);
2911                m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
2912                pi2_src_temp += (stride);
2913                m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
2914                if(0 == i)
2915                {
2916                    pi2_src_temp -= (stride * 2 - 8);
2917                }
2918                else
2919                {
2920                    pi2_src_temp -= (stride * 6 - 8);
2921                }
2922                m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
2923                pi2_src_temp -= (stride);
2924                m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
2925                pi2_src_temp -= (stride * 7);
2926                m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2927                pi2_src_temp -= (stride);
2928                m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2929
2930
2931                for(j = 0; j < 2; j++)
2932                {
2933
2934                    if(j) //H8B= higher 8 bytes L8B lower 8 bytes
2935                    {
2936                        m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2937                        m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2938                        m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
2939                        m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
2940                    }
2941                    else
2942                    {
2943                        m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2944                        m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2945                        m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
2946                        m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
2947                    }
2948                    m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2949                    m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2950                    m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
2951                    m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25  9
2952
2953
2954                    /* o0[0-3] */
2955                    {
2956                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2957                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2958                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2959                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2960
2961
2962                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2963                        pi2_src_scratch += in_stride;
2964
2965                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2966                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2967                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
2968                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
2969
2970                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2971                        m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2972                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2973                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2974                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2975
2976                        m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2977                        m_count = _mm_cvtsi32_si128(i4_shift);
2978                        m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2979
2980                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2981                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2982                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2983                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2984
2985                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2986
2987                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2988                        pi2_dst_scratch += out_stride;
2989                    }
2990
2991                    /* o1[0-3] */
2992                    {
2993                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2994                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2995                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
2996                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
2997
2998
2999                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3000                        pi2_src_scratch += in_stride;
3001
3002                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
3003                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
3004                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
3005                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
3006
3007                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3008                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3009                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3010                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3011                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3012
3013                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3014                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3015                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3016                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3017
3018                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3019
3020                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3021                        pi2_dst_scratch += ((!i) * out_stride + 8);
3022                    }
3023
3024                    /* o2[0-3] */
3025                    {
3026                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3027                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3028                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3029                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3030
3031                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3032                        pi2_src_scratch += in_stride;
3033
3034                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
3035                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
3036                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
3037                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
3038
3039                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3040                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3041                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3042                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3043                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3044
3045                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3046                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3047                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3048                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3049
3050                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3051
3052                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3053                        pi2_dst_scratch += out_stride;
3054                    }
3055
3056                    /* o3[0-3] */
3057                    {
3058                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3059                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3060                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3061                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3062
3063
3064                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3065                        pi2_src_scratch += 8;
3066
3067                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
3068                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
3069                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
3070                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
3071
3072                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
3073                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3074                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3075                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3076                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3077
3078                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3079                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3080                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3081                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3082
3083                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3084
3085                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3086                        pi2_dst_scratch += (i * out_stride + 8);
3087                    }
3088
3089                    /* o4[0-3] */
3090                    {
3091                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3092                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3093                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3094                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3095
3096                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3097                        pi2_src_scratch -= in_stride;
3098
3099                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
3100                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
3101                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
3102                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
3103
3104                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3105                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3106                        m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3107                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3108                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3109
3110                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3111                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3112                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3113                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3114
3115                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3116
3117                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3118                        pi2_dst_scratch += out_stride;
3119                    }
3120
3121                    /* o5[0-3] */
3122                    {
3123                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3124                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3125                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3126                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3127
3128
3129                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3130                        pi2_src_scratch -= in_stride;
3131
3132                        m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
3133                        m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
3134                        m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
3135                        m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
3136
3137                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3138                        m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3139                        m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3140                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3141                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3142
3143                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3144                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3145                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3146                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3147
3148                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3149
3150                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3151                        pi2_dst_scratch += ((!i) * out_stride + 8);
3152                    }
3153
3154                    /* o6[0-3] */
3155                    {
3156                        m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3157                        m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3158                        m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3159                        m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3160
3161
3162                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3163                        pi2_src_scratch -= in_stride;
3164
3165                        m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
3166                        m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
3167                        m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
3168                        m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
3169
3170
3171                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3172                        m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3173                        m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3174                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3175                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3176
3177                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3178                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3179                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3180                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3181
3182                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3183
3184                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3185                        pi2_dst_scratch += out_stride;
3186                    }
3187
3188                    /* o7[0-3] */
3189                    {
3190                        m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3191                        m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3192                        m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3193                        m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3194
3195                        m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3196                        pi2_src_scratch += 8;
3197
3198                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3199                        m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3200                        m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3201                        m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3202                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3203
3204
3205                        m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3206                        m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3207                        m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3208                        m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3209
3210                        m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3211
3212                        _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3213                        pi2_dst_scratch += (i * out_stride + 8);
3214                    }
3215
3216                }
3217            }
3218        }
3219    }
3220
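3221    /* Transpose the data in pi2_tmp, add the prediction from pu1_pred and store to pu1_dst with unsigned 8-bit saturation */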
3222    {
3223        WORD16 *pi2_src_scratch;
3224        UWORD8 *pu1_pred_temp = pu1_pred;
3225        WORD32 out_stride = dst_strd;
3226        WORD32 in_stride = trans_size;
3227        WORD32 j;
3228        m_temp_reg_1 = _mm_setzero_si128();
3229        for(i = 0; i < 2; i++)
3230        {
3231            pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
3232
3233            for(j = 0; j < 2; j++)
3234            {
3235                m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
3236                pi2_src_scratch += in_stride;
3237                m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
3238                pi2_src_scratch += ((!i) * in_stride + 8);
3239                m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
3240                pi2_src_scratch += (in_stride);
3241                m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
3242                pi2_src_scratch += (i * in_stride + 8);
3243                m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
3244                pi2_src_scratch += in_stride;
3245                m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
3246                pi2_src_scratch += ((!i) * in_stride + 8);
3247                m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
3248                pi2_src_scratch += in_stride;
3249                m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
3250                pi2_src_scratch += (i * in_stride + 8);
3251
3252                m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
3253                m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
3254
3255                m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
3256                m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
3257
3258                m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
3259                m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
3260
3261                m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
3262                m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
3263
3264
3265                m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
3266                m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
3267
3268                m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
3269                m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
3270
3271                m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
3272                m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
3273
3274                m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
3275                m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
3276
3277
3278                m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
3279                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3280
3281                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3282                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3283
3284                m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
3285                m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
3286                m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
3287
3288                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
3289                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3290                pu1_dst += out_stride;
3291                pu1_pred_temp += pred_strd;
3292
3293                m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
3294                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3295
3296                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3297                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3298
3299                m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
3300                m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
3301                m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
3302
3303                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
3304                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3305                pu1_dst += out_stride;
3306                pu1_pred_temp += pred_strd;
3307
3308                m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
3309                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3310
3311                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3312                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3313
3314                m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
3315                m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
3316                m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
3317
3318                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
3319                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3320                pu1_dst += out_stride;
3321                pu1_pred_temp += pred_strd;
3322
3323                m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
3324                m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3325
3326                m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3327                m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3328
3329                m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
3330                m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
3331                m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
3332
3333                m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
3334                _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3335                pu1_dst += out_stride;
3336                pu1_pred_temp += pred_strd;
3337            }
3338        }
3339    }
3340}
3341