1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21*  ihevc_chroma_intra_pred_filters_atom_intr.c
22*
23* @brief
24*  Contains function Definition for intra prediction  interpolation filters
25*
26*
27* @author
28*  Ittiam
29*
30* @par List of Functions:
31*  ihevc_intra_pred_chroma_planar_ssse3()
32*
33*  ihevc_intra_pred_chroma_dc_ssse3()
34*
35*  ihevc_intra_pred_chroma_horz_ssse3()
36*
37*  ihevc_intra_pred_chroma_ver_ssse3()
38*
39*  ihevc_intra_pred_chroma_mode2_ssse3()
40*
41*  ihevc_intra_pred_chroma_mode_18_34_ssse3()
42*
43*  ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
44*
45*  ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
46*
47*  ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
48*
49*  ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
50*
51*
52*
53* @remarks
54*  None
55*
56*******************************************************************************
57*/
58
59
60/*****************************************************************************/
61/* File Includes                                                             */
62/*****************************************************************************/
63
64#include "ihevc_typedefs.h"
65#include "ihevc_platform_macros.h"
66#include "ihevc_macros.h"
67#include "ihevc_func_selector.h"
68#include "ihevc_intra_pred.h"
69
70#include "ihevc_chroma_intra_pred.h"
71#include "ihevc_common_tables.h"
72#include "ihevc_tables_x86_intr.h"
73
74#include <mmintrin.h>
75#include <xmmintrin.h>
76#include <emmintrin.h>
77
78#include <immintrin.h>
79
80
81/****************************************************************************/
82/* Constant Macros                                                          */
83/****************************************************************************/
84#define MAX_CU_SIZE 64
85#define BIT_DEPTH 8
86#define T32_4NT 128
87#define T16_4NT 64
88#define T16C_4NT 64
89#define T8C_4NT 32
90/****************************************************************************/
91/* Function Macros                                                          */
92/****************************************************************************/
93
94#define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x)
95
96/* tables to shuffle 8-bit values */
97
98/*****************************************************************************/
99/* Function Definition                                                      */
100/*****************************************************************************/
101
102
103
104/**
105*******************************************************************************
106*
107* @brief
108*  Planar Intraprediction with reference neighboring samples location
109* pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
110* to section 8.4.4.2.4 in the standard
111*
112* @par Description:
113*
114*
115* @param[in] pu1_src
116*  UWORD8 pointer to the source
117*
118* @param[in] pu1_dst
119*  UWORD8 pointer to the destination
120*
121* @param[in] src_strd
122*  integer source stride
123*
124* @param[in] dst_strd
125*  integer destination stride
126*
127* @param[in] nt
128*  integer Transform Block size
129*
130* @param[in] mode
131*  integer intraprediction mode
132*
133* @returns
134*
135* @remarks
136*  None
137*
138*******************************************************************************
139*/
140
141void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
142                                          WORD32 src_strd,
143                                          UWORD8 *pu1_dst,
144                                          WORD32 dst_strd,
145                                          WORD32 nt,
146                                          WORD32 mode)
147{
148
149    WORD32 row, col;
150    WORD32 log2nt = 5;
151    WORD32 two_nt, three_nt;
152
153    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
154    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
155    UNUSED(src_strd);
156    UNUSED(mode);
157    switch(nt)
158    {
159        case 16:
160            log2nt = 4;
161            break;
162        case 8:
163            log2nt = 3;
164            break;
165        case 4:
166            log2nt = 2;
167            break;
168        default:
169            break;
170    }
171    two_nt = 2 * nt;
172    three_nt = 3 * nt;
173
174    /* Planar filtering */
175
176/* setting vallues in  registera*/
177
178//  pu1_ref[2*(two_nt - 1 - row)]
179//  pu1_ref[2 * (three_nt + 1)]
180//  pu1_ref[2 * (two_nt + 1) + col]
181//  pu1_ref[2 * (nt - 1)]
182
183    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
184                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
185                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
186
187    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
188                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
189
190    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
191    const_temp6_4x32b = _mm_set1_epi16(nt);
192    const_temp7_4x32b = _mm_set1_epi16(4);
193
194    zero_8x16b = _mm_set1_epi32(0);
195
196
197    if(nt % 4 == 0)
198    {
199        const_temp7_4x32b = _mm_set1_epi16(4);
200
201        for(row = 0; row < nt; row++)
202        {
203            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
204            __m128i res_temp3_8x16b;
205
206            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
207                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
208                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
209
210            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
211            row_8x16b = _mm_set1_epi16((nt - 1 - row));
212
213            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
214            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
215
216            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
217
218            /*(row + 1) * pu1_ref[nt - 1]*/
219            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
220
221            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
222            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
223
224            for(col = 0; col < 2 * nt; col += 8)
225            {
226                __m128i src_temp_8x16b;
227
228                /* loding 8bit 16 pixles*/
229                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
230
231                //src_temp_8x16b =  _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/
232                src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);
233
234                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
235                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
236
237                /*(col + 1) * pu1_ref[three_nt + 1]*/
238                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
239
240                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
241                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
242
243                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
244                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
245                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
246
247                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
248                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
249
250                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
251
252                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
253                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
254            } /* inner loop ends here */
255        }
256    }
257}
258
259
260/**
261*******************************************************************************
262*
263* @brief
264*  Intraprediction for DC mode with reference neighboring  samples location
265* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
266* to section 8.4.4.2.5 in the standard
267*
268* @par Description:
269*
270*
271* @param[in] pu1_src
272*  UWORD8 pointer to the source
273*
274* @param[in] pu1_dst
275*  UWORD8 pointer to the destination
276*
277* @param[in] src_strd
278*  integer source stride
279*
280* @param[in] dst_strd
281*  integer destination stride
282*
283* @param[in] nt
284*  integer Transform Block size (Chroma)
285*
286* @param[in] mode
287*  integer intraprediction mode
288*
289* @returns
290*
291* @remarks
292*  None
293*
294*******************************************************************************
295*/
296
297void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
298                                      WORD32 src_strd,
299                                      UWORD8 *pu1_dst,
300                                      WORD32 dst_strd,
301                                      WORD32 nt,
302                                      WORD32 mode)
303{
304
305    WORD32 acc_dc_u, acc_dc_v;
306    WORD32 dc_val_u, dc_val_v;
307    WORD32 row;
308    WORD32 log2nt = 5;
309    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
310    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
311    __m128i m_zero = _mm_set1_epi32(0);
312    UNUSED(src_strd);
313    UNUSED(mode);
314
315    switch(nt)
316    {
317        case 32:
318            log2nt = 5;
319            break;
320        case 16:
321            log2nt = 4;
322            break;
323        case 8:
324            log2nt = 3;
325            break;
326        case 4:
327            log2nt = 2;
328            break;
329        default:
330            break;
331    }
332
333    acc_dc_u = 0;
334    acc_dc_v = 0;
335
336    /* Calculate DC value for the transform block */
337
338    m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
339
340    if(nt == 16)
341    {
342        __m128i temp_sad, sign_8x16b;
343
344        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
345        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
346        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
347        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
348
349        src_temp5  = _mm_unpacklo_epi8(src_temp3, m_zero);
350        src_temp6  = _mm_unpacklo_epi8(src_temp4, m_zero);
351        src_temp9  = _mm_unpacklo_epi8(src_temp7, m_zero);
352        src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);
353
354        src_temp3 = _mm_srli_si128(src_temp3, 8);
355        src_temp4 = _mm_srli_si128(src_temp4, 8);
356        src_temp7 = _mm_srli_si128(src_temp7, 8);
357        src_temp8 = _mm_srli_si128(src_temp8, 8);
358
359        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
360        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
361        src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
362        src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);
363
364        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
365        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
366        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
367        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
368
369        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
370        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
371
372        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
373        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
374        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
375        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
376
377        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
378        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
379
380        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
381        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
382        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
383    }
384
385    else if(nt == 8)
386    {
387        __m128i temp_sad, sign_8x16b;
388        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
389        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
390
391        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
392        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
393
394        src_temp3 = _mm_srli_si128(src_temp3, 8);
395        src_temp4 = _mm_srli_si128(src_temp4, 8);
396
397        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
398        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
399
400        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
401        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
402
403        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
404        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
405        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
406        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
407
408        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
409        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
410
411        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
412        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
413        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
414    }
415
416    else if(nt == 4)
417    {
418        __m128i temp_sad, sign_8x16b;
419        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
420
421        src_temp5 =  _mm_unpacklo_epi8(src_temp3, m_zero);
422        src_temp4 = _mm_srli_si128(src_temp3, 8);
423
424        src_temp4 =  _mm_unpacklo_epi8(src_temp4, m_zero);
425
426        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
427
428        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
429        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
430        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
431
432        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
433        src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
434
435        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
436        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
437        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
438    }
439
440
441    acc_dc_u += pu1_ref[6 * nt];
442    acc_dc_v += pu1_ref[6 * nt + 1];
443
444    acc_dc_u -= pu1_ref[4 * nt];
445    acc_dc_v -= pu1_ref[4 * nt + 1];
446
447    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
448    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
449
450    dc_val_u = dc_val_u | (dc_val_v << 8);
451
452    /* Fill the remaining rows with DC value*/
453
454    if(nt == 4)
455    {
456        src_temp1 = _mm_set1_epi16(dc_val_u);
457
458        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
459        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
460        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
461        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
462        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
463
464    }
465    else if(nt == 8)
466    {
467        src_temp1 = _mm_set1_epi16(dc_val_u);
468
469        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
470        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
471        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
472        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
473        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
474
475        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
476        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
477        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
478        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
479
480    }
481
482    else /* nt == 16 */
483    {
484        src_temp1 = _mm_set1_epi16(dc_val_u);
485
486        for(row = 0; row < nt; row += 8)
487        {
488            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
489            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
490            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
491            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
492            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
493            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
494            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
495            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
496            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
497
498            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
499            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
500            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
501            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
502            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
503            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
504            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
505            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
506
507            pu1_dst += 8 * dst_strd;
508        }
509    }
510
511}
512
513
514/**
515*******************************************************************************
516*
517* @brief
518*  Horizontal intraprediction(mode 10) with reference  samples location
519* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
520* to section 8.4.4.2.6 in the standard (Special case)
521*
522* @par Description:
523*
524*
525* @param[in] pu1_src
526*  UWORD8 pointer to the source
527*
528* @param[in] pu1_dst
529*  UWORD8 pointer to the destination
530*
531* @param[in] src_strd
532*  integer source stride
533*
534* @param[in] dst_strd
535*  integer destination stride
536*
537* @param[in] nt
538*  integer Transform Block size
539*
540* @param[in] mode
541*  integer intraprediction mode
542*
543* @returns
544*
545* @remarks
546*  None
547*
548*******************************************************************************
549*/
550
551void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
552                                        WORD32 src_strd,
553                                        UWORD8 *pu1_dst,
554                                        WORD32 dst_strd,
555                                        WORD32 nt,
556                                        WORD32 mode)
557{
558
559    WORD32 row;
560    __m128i temp1, temp2, temp3, temp4, temp5, temp6,  temp7, temp8;
561    UNUSED(src_strd);
562    UNUSED(mode);
563
564    /* Replication to next rows*/
565
566    if(nt == 8)
567    {
568        for(row = 0; row < nt; row += 4)
569        {
570            temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
571            temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
572            temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
573            temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
574            temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
575            temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
576            temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
577            temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
578
579            temp2 = _mm_unpacklo_epi8(temp1, temp2);
580            temp4 = _mm_unpacklo_epi8(temp3, temp4);
581            temp6 = _mm_unpacklo_epi8(temp5, temp6);
582            temp8 = _mm_unpacklo_epi8(temp7, temp8);
583
584            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2);
585            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4);
586            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6);
587            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8);
588
589        }
590    }
591    else if(nt == 16)
592    {
593        for(row = 0; row < nt; row += 4)
594        {
595            temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
596            temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
597
598            temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
599            temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
600
601            temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
602            temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
603
604            temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
605            temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
606
607            temp2 = _mm_unpacklo_epi8(temp1, temp2);
608            temp4 = _mm_unpacklo_epi8(temp3, temp4);
609            temp6 = _mm_unpacklo_epi8(temp5, temp6);
610            temp8 = _mm_unpacklo_epi8(temp7, temp8);
611
612            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2);
613            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2);
614
615            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), temp4);
616            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4);
617
618            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6);
619            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6);
620
621            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8);
622            _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8);
623
624
625        }
626    }
627    else
628    {
629        temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]);
630        temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]);
631
632        temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]);
633        temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]);
634
635        temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]);
636        temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]);
637
638        temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]);
639        temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]);
640
641        temp2 = _mm_unpacklo_epi8(temp1, temp2);
642        temp4 = _mm_unpacklo_epi8(temp3, temp4);
643        temp6 = _mm_unpacklo_epi8(temp5, temp6);
644        temp8 = _mm_unpacklo_epi8(temp7, temp8);
645
646        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2);
647        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4);
648        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6);
649        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8);
650    }
651}
652
653
654/**
655*******************************************************************************
656*
657* @brief
658*  Horizontal intraprediction with reference neighboring  samples location
659* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
660* to section 8.4.4.2.6 in the standard (Special case)
661*
662* @par Description:
663*
664*
665* @param[in] pu1_src
666*  UWORD8 pointer to the source
667*
668* @param[in] pu1_dst
669*  UWORD8 pointer to the destination
670*
671* @param[in] src_strd
672*  integer source stride
673*
674* @param[in] dst_strd
675*  integer destination stride
676*
677* @param[in] nt
678*  integer Transform Block size
679*
680* @param[in] mode
681*  integer intraprediction mode
682*
683* @returns
684*
685* @remarks
686*  None
687*
688*******************************************************************************
689*/
690
691void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
692                                       WORD32 src_strd,
693                                       UWORD8 *pu1_dst,
694                                       WORD32 dst_strd,
695                                       WORD32 nt,
696                                       WORD32 mode)
697{
698    __m128i src_temp1;
699    UNUSED(src_strd);
700    UNUSED(mode);
701
702    /* Replication to next columns*/
703    if(nt == 8)
704    {
705        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
706
707        _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1);
708        _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
709        _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1);
710        _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1);
711
712        _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1);
713        _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1);
714        _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1);
715        _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1);
716
717    }
718    if(nt == 16)
719    {
720        __m128i temp1, temp2;
721
722        temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
723        temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));
724
725        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
726        _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
727        _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
728        _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
729        _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
730        _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
731        _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
732        _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
733        _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
734
735        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
736        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
737        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
738        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
739        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
740        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
741        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
742        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
743
744        _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
745        _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
746        _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
747        _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
748        _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
749        _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
750        _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
751        _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
752
753        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
754        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
755        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
756        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
757        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
758        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
759        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
760        _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
761
762    }
763    else
764    {
765        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
766
767        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
768        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
769        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
770        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
771
772
773    }
774
775}
776
777/**
778*******************************************************************************
779*
780* @brief
781*  Intraprediction for mode 2 (sw angle) with reference  neighboring samples
782* location pointed by 'pu1_ref' to the  TU block location pointed by
783* 'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
784*
785* @par Description:
786*
787*
788* @param[in] pu1_src
789*  UWORD8 pointer to the source
790*
791* @param[in] pu1_dst
792*  UWORD8 pointer to the destination
793*
794* @param[in] src_strd
795*  integer source stride
796*
797* @param[in] dst_strd
798*  integer destination stride
799*
800* @param[in] nt
801*  integer Transform Block size
802*
803* @param[in] mode
804*  integer intraprediction mode
805*
806* @returns
807*
808* @remarks
809*  None
810*
811*******************************************************************************
812*/
813
814void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
815                                         WORD32 src_strd,
816                                         UWORD8 *pu1_dst,
817                                         WORD32 dst_strd,
818                                         WORD32 nt,
819                                         WORD32 mode)
820{
821    WORD32 row, col;
822
823
824    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
825    UNUSED(src_strd);
826    UNUSED(mode);
827
828    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
829    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);
830
831    /* For the angle 45, replication is done from the corresponding angle */
832    /* intra_pred_ang = tan(angle) in q5 format */
833
834    if(nt == 4)
835    {
836        /*pu1_ref[two_nt - row - (col+1) - 1]*/
837        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
838        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
839        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
840        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));
841
842        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
843        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
844        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
845        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));
846
847    }
848    else if(nt == 8)
849    {
850        /*pu1_ref[two_nt - row - (col+1) - 1]*/
851        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
852        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
853        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
854        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
855        src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
856        src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
857        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
858        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));
859
860        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
861        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
862        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
863        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
864        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
865        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
866        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
867        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
868
869
870    }
871    else
872    {
873        for(row = 0; row < nt; row += 8)
874        {
875            for(col = 0; col < 2 * nt; col += 16)
876            {   /*pu1_ref[two_nt - row - (col+1) - 1]*/
877                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
878                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
879                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
880                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
881                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
882                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
883                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
884                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));
885
886                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
887                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
888                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
889                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
890                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
891                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
892                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
893                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
894            }
895        }
896    }
897}
898
899/**
900*******************************************************************************
901*
902* @brief
903*  Intraprediction for mode 34 (ne angle) and  mode 18 (nw angle) with
904* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
905* block location pointed by 'pu1_dst'
906*
907* @par Description:
908*
909*
910* @param[in] pu1_src
911*  UWORD8 pointer to the source
912*
913* @param[in] pu1_dst
914*  UWORD8 pointer to the destination
915*
916* @param[in] src_strd
917*  integer source stride
918*
919* @param[in] dst_strd
920*  integer destination stride
921*
922* @param[in] nt
923*  integer Transform Block size
924*
925* @param[in] mode
926*  integer intraprediction mode
927*
928* @returns
929*
930* @remarks
931*  None
932*
933*******************************************************************************
934*/
935
936void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
937                                              WORD32 src_strd,
938                                              UWORD8 *pu1_dst,
939                                              WORD32 dst_strd,
940                                              WORD32 nt,
941                                              WORD32 mode)
942{
943    WORD32 row;
944    WORD32 idx = 0;
945
946    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
947    UNUSED(src_strd);
948
949    if(mode == 34)
950    {
951        if(nt == 4)
952        {
953            /*pu1_ref[two_nt + col + idx + 1]*/
954            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
955            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
956            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
957            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
958
959            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
960            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
961            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
962            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
963
964        }
965        else if(nt == 8)
966        {
967            /*pu1_ref[two_nt + col + idx + 1]*/
968            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
969            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
970            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
971            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
972            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
973            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
974            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
975            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
976
977            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
978            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
979            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
980            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
981            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
982            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
983            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
984            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
985
986
987        }
988        else
989        {
990            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
991            for(row = 0; row < nt; row += 8)
992            {
993                /*pu1_ref[two_nt + col + idx + 1]*/
994                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
995                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
996                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
997                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
998                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
999                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1000                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1001                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1002
1003                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
1004                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
1005                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
1006                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
1007                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
1008                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
1009                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
1010                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
1011
1012                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1013                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1014                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1015                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1016                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1017                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1018                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1019                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1020
1021                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
1022                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
1023                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
1024                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
1025                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
1026                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
1027                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
1028                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
1029
1030                pu1_ref += 2 * 8;
1031                pu1_dst += 8 * dst_strd;
1032            }
1033        }
1034    }
1035    else
1036    {
1037        if(nt == 4)
1038        {
1039            /*pu1_ref[two_nt + col + idx + 1]*/
1040            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
1041            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
1042            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
1043            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
1044
1045            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
1046            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
1047            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
1048            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
1049
1050
1051        }
1052        else if(nt == 8)
1053        {
1054            /*pu1_ref[two_nt + col + idx + 1]*/
1055            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
1056            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
1057            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
1058            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
1059            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
1060            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
1061            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
1062            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
1063
1064            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
1065            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
1066            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
1067            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
1068            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
1069            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
1070            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
1071            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
1072
1073
1074        }
1075        else
1076        {
1077            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
1078            for(row = 0; row < nt; row += 8)
1079            {
1080                /*pu1_ref[two_nt + col + idx + 1]*/
1081                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1082                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1083                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1084                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1085                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1086                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1087                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1088                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1089
1090                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
1091                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
1092                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
1093                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
1094                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
1095                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
1096                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
1097                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
1098
1099                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1100                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1101                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1102                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1103                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1104                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1105                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
1106                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1107
1108                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
1109                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
1110                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
1111                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
1112                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
1113                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
1114                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
1115                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
1116
1117                pu1_ref -= 2 * 8;
1118                pu1_dst += 8 * dst_strd;
1119            }
1120        }
1121    }
1122
1123}
1124
1125/**
1126*******************************************************************************
1127*
1128* @brief
1129*  Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
1130* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
1131* block location pointed by 'pu1_dst'
1132*
1133* @par Description:
1134*
1135*
1136* @param[in] pu1_src
1137*  UWORD8 pointer to the source
1138*
1139* @param[in] pu1_dst
1140*  UWORD8 pointer to the destination
1141*
1142* @param[in] src_strd
1143*  integer source stride
1144*
1145* @param[in] dst_strd
1146*  integer destination stride
1147*
1148* @param[in] nt
1149*  integer Transform Block size
1150*
1151* @param[in] mode
1152*  integer intraprediction mode
1153*
1154* @returns
1155*
1156* @remarks
1157*  None
1158*
1159*******************************************************************************
1160*/
1161
1162void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
1163                                               WORD32 src_strd,
1164                                               UWORD8 *pu1_dst,
1165                                               WORD32 dst_strd,
1166                                               WORD32 nt,
1167                                               WORD32 mode)
1168{
1169    WORD32 row, col;
1170
1171    WORD32 intra_pred_ang;
1172
1173    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
1174    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
1175    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
1176    UNUSED(src_strd);
1177
1178    /* Intra Pred Angle according to the mode */
1179    intra_pred_ang = gai4_ihevc_ang_table[mode];
1180
1181    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
1182    /* samples dependent on distance to obtain destination sample */
1183
1184    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
1185    const_temp_4x32b  = _mm_set1_epi16(16);
1186    const_temp2_4x32b = _mm_set1_epi32(31);
1187    const_temp3_4x32b = _mm_set1_epi16(32);
1188    const_temp4_4x32b = _mm_set1_epi32(4);
1189
1190    two_nt_4x32b = _mm_set1_epi32(1);
1191
1192    zero_8x16b = _mm_set1_epi16(0);
1193
1194
1195    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
1196    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
1197
1198    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
1199
1200    if(nt == 4)
1201    {
1202        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
1203        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
1204        const_temp2_4x32b = _mm_set1_epi16(31);
1205        const_temp4_4x32b = _mm_set1_epi16(4);
1206        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
1207
1208        {
1209            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
1210            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
1211
1212            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
1213            __m128i src_values10;
1214
1215            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
1216
1217            /* pos = ((row + 1) * intra_pred_ang); */
1218            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
1219
1220            /* fract = pos & (31); */
1221            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
1222
1223            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
1224
1225            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
1226
1227            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
1228
1229            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
1230
1231            /*(32 - fract) */
1232            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
1233
1234            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
1235            _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
1236
1237            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
1238            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
1239            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
1240            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
1241
1242            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
1243            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
1244            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
1245            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
1246
1247            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
1248            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
1249            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
1250            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
1251
1252            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
1253            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
1254            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
1255            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
1256
1257            {
1258                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1259                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1260
1261                /* loding 8-bit 16 pixels */
1262                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
1263                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
1264                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
1265                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/
1266
1267                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
1268                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
1269                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
1270                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
1271
1272                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
1273                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
1274                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
1275                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
1276
1277                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1278                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
1279                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
1280                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
1281                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
1282
1283                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1284                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
1285                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
1286                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
1287                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
1288
1289                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1290                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
1291                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
1292                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
1293                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
1294
1295                /* converting 16 bit to 8 bit */
1296                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
1297                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
1298                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
1299                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
1300
1301                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
1302                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
1303                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
1304                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
1305
1306                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
1307                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
1308
1309                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
1310                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
1311
1312                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
1313
1314                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
1315                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
1316
1317                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
1318
1319                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
1320                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
1321
1322            }
1323        }
1324    }
1325    else
1326    {
1327        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
1328        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
1329        const_temp2_4x32b = _mm_set1_epi16(31);
1330        const_temp4_4x32b = _mm_set1_epi16(8);
1331        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
1332
1333        for(col = 0; col < 2 * nt; col += 16)
1334        {
1335            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
1336            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
1337            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
1338
1339            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
1340            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
1341
1342            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
1343            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
1344
1345            /* pos = ((row + 1) * intra_pred_ang); */
1346            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
1347
1348            /* fract = pos & (31); */
1349            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
1350
1351            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
1352
1353            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
1354
1355            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
1356
1357            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
1358
1359            /*(32 - fract) */
1360            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
1361
1362            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
1363            _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
1364
1365            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
1366            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
1367            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
1368            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
1369
1370            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
1371            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
1372            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
1373            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
1374
1375            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
1376            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
1377            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
1378            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
1379
1380            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
1381            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
1382            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
1383            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
1384
1385            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
1386            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
1387            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
1388            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
1389
1390            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
1391            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
1392            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
1393            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
1394
1395            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
1396            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
1397            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
1398            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
1399
1400            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
1401            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
1402            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
1403            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
1404
1405            for(row = 0; row < nt; row += 4)
1406            {
1407                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1408                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1409
1410                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
1411                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
1412
1413                /* loading 16 8-bit pixels */
1414                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
1415                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
1416                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
1417                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/
1418
1419                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
1420                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
1421                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
1422                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
1423
1424                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
1425                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
1426                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
1427                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
1428
1429                /* loading 16 8-bit pixels */
1430                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=4*/
1431                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=5*/
1432                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=6*/
1433                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=7*/
1434
1435                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
1436                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
1437                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
1438                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
1439
1440                src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=4*/
1441                src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=5*/
1442                src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=6*/
1443                src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=7*/
1444
1445                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1446                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
1447                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
1448                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
1449                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
1450
1451                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1452                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
1453                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
1454                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
1455                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
1456
1457                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1458                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
1459                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
1460                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
1461                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
1462
1463                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1464                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
1465                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
1466                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
1467                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
1468
1469                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1470                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
1471                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
1472                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
1473                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
1474
1475                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1476                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
1477                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
1478                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
1479                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
1480
1481                /* converting 16 bit to 8 bit */
1482                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
1483                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
1484                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
1485                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
1486
1487                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
1488                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
1489                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
1490                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
1491
1492                /* converting 16 bit to 8 bit */
1493                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
1494                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
1495                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
1496                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
1497
1498                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
1499                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
1500                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
1501                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);
1502
1503                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
1504                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
1505
1506                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
1507                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
1508
1509                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
1510                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
1511
1512                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
1513                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
1514
1515                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
1516                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
1517                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
1518                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
1519
1520                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
1521                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
1522                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
1523                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
1524
1525            }
1526        }
1527    }
1528}
1529
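/*
 * Illustrative scalar sketch (guarded by #if 0, so it is never compiled): the
 * angular chroma prediction functions around this point all reduce, per
 * destination sample, to a two-tap blend of neighbouring interleaved Cb/Cr
 * reference samples weighted by a 5-bit fraction.  The helper name and the
 * "pair pointer" convention below are assumptions made purely for
 * illustration; the SIMD paths compute the same arithmetic with
 * _mm_maddubs_epi16 on interleaved reference-sample pairs and
 * (32 - fract, fract) weight bytes.
 */
#if 0
static void chroma_two_tap_blend_sketch(const UWORD8 *pu1_ref_pair, /* points at the Cb sample of ref[idx]    */
                                        WORD32 fract,               /* pos & 31, the 1/32-sample phase        */
                                        UWORD8 *pu1_dst_pair)       /* one interleaved Cb/Cr destination pair */
{
    /* Cb: ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5 */
    pu1_dst_pair[0] = (UWORD8)(((32 - fract) * pu1_ref_pair[0]
                                + fract * pu1_ref_pair[2] + 16) >> 5);

    /* Cr: the same filter applied to the second interleaved channel */
    pu1_dst_pair[1] = (UWORD8)(((32 - fract) * pu1_ref_pair[1]
                                + fract * pu1_ref_pair[3] + 16) >> 5);
}
#endif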
1530/**
1531*******************************************************************************
1532*
1533* @brief
1534*  Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
1535* with reference  neighboring samples location pointed by 'pu1_ref' to the
1536* TU block location pointed by 'pu1_dst'
1537*
1538* @par Description:
1539*
1540*
1541* @param[in] pu1_ref
1542*  UWORD8 pointer to the source
1543*
1544* @param[in] pu1_dst
1545*  UWORD8 pointer to the destination
1546*
1547* @param[in] src_strd
1548*  integer source stride
1549*
1550* @param[in] dst_strd
1551*  integer destination stride
1552*
1553* @param[in] nt
1554*  integer Transform Block size
1555*
1556* @param[in] mode
1557*  integer intraprediction mode
1558*
1559* @returns
1560*
1561* @remarks
1562*  None
1563*
1564*******************************************************************************
1565*/
1566
1567
1568void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
1569                                                 WORD32 src_strd,
1570                                                 UWORD8 *pu1_dst,
1571                                                 WORD32 dst_strd,
1572                                                 WORD32 nt,
1573                                                 WORD32 mode)
1574{
1575    /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same,  */
1576    /* except for the ref main & side sample assignment; they can be combined */
1577    /* for optimization                                                        */
1578
1579    WORD32 row, col, k;
1580    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
1581    WORD32 ref_idx;
1582
1583
1584    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
1585    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
1586    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;
1587
1588    UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
1589    UWORD8 *ref_main;
1590    UNUSED(src_strd);
1591
1592    inv_ang_sum = 128;
1593
1594    intra_pred_ang = gai4_ihevc_ang_table[mode];
1595
1596    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
1597    /* Intermediate reference samples for negative angle modes */
1598    /* This has to be removed during optimization */
1599
1600    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
1601
1602
1603    ref_main = ref_temp + 2 * nt;
1604    for(k = 0; k < (2 * (nt + 1)); k += 2)
1605    {
1606        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
1607        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
1608    }
1609
1610    ref_main = ref_temp + (2 * (nt - 1));
1611    ref_idx = (nt * intra_pred_ang) >> 5;
1612
1613    /* SIMD Optimization can be done using look-up table for the loop */
1614    /* For negative angles, derive the main reference samples from the side */
1615    /* reference samples; refer to section 8.4.4.2.6 */
1616
1617    for(k = -2; k > (2 * ref_idx); k -= 2)
1618    {
1619        inv_ang_sum += inv_ang;
1620        ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
1621        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
1622    }
1623
1624    /* For angles other than 45 degrees, interpolate between 2 neighboring */
1625    /* samples, weighted by the distance, to obtain the destination sample */
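    /* i.e. pred = ((32 - fract) * ref[ref_main_idx] + fract * ref[ref_main_idx + 1] + 16) >> 5, */
    /* where pos = ((row + 1) * intra_pred_ang), idx = pos >> 5 and fract = pos & 31 are         */
    /* computed in the SIMD setup below                                                          */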
1626
1627    const_temp_4x32b  = _mm_set1_epi16(16);
1628    const_temp2_4x32b = _mm_set1_epi32(31);
1629    const_temp3_4x32b = _mm_set1_epi16(32);
1630    const_temp4_4x32b = _mm_set1_epi32(4);
1631
1632    two_nt_4x32b = _mm_set1_epi32(1);
1633
1634    zero_8x16b = _mm_set1_epi16(0);
1635
1636
1637    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
1638    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
1639
1640    row_4x32b = _mm_set_epi32(4, 3, 2, 1);
1641
1642    if(nt == 4)
1643    {
1644        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
1645        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
1646        const_temp2_4x32b = _mm_set1_epi16(31);
1647        const_temp4_4x32b = _mm_set1_epi16(4);
1648        two_nt_4x32b = _mm_set1_epi16(1);
1649
1650        {
1651            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
1652            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
1653
1654            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
1655            __m128i src_values10;
1656
1657            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
1658
1659            /* pos = ((row + 1) * intra_pred_ang); */
1660            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
1661
1662            /* fract = pos & (31); */
1663            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
1664
1665            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
1666            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
1667
1668            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
1669
1670            /*(32 - fract) */
1671            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
1672
1673            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
1674            _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
1675
1676            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
1677            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
1678            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
1679            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
1680
1681            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
1682            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
1683            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
1684            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
1685
1686            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
1687            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
1688            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
1689            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
1690
1691            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
1692            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
1693            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
1694            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
1695
1696            {
1697                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1698                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1699
1700                /* loading 16 8-bit pixels */
1701                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
1702                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
1703                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
1704                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
1705
1706                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
1707                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
1708                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
1709                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
1710
1711                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
1712                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
1713                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
1714                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
1715
1716                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1717                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
1718                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
1719                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
1720                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
1721
1722                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1723                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
1724                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
1725                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
1726                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
1727
1728                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1729                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
1730                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
1731                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
1732                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
1733
1734                /* converting 16 bit to 8 bit */
1735                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
1736                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
1737                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
1738                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
1739
1740                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
1741                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
1742
1743                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
1744                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
1745
1746                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
1747
1748                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
1749                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
1750
1751                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
1752
1753                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
1754                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
1755
1756            }
1757        }
1758    }
1759    else
1760    {
1761        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
1762        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
1763        const_temp2_4x32b = _mm_set1_epi16(31);
1764        const_temp4_4x32b = _mm_set1_epi16(8);
1765        two_nt_4x32b = _mm_set1_epi16(1);
1766
1767        for(col = 0; col < 2 * nt; col += 16)
1768        {
1769            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
1770            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
1771            WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
1772
1773            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
1774            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
1775
1776            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
1777            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
1778
1779            /* pos = ((row + 1) * intra_pred_ang); */
1780            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
1781
1782            /* fract = pos & (31); */
1783            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
1784
1785            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
1786            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
1787
1788            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
1789
1790            /*(32 - fract) */
1791            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
1792
1793            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
1794            _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
1795
1796            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
1797            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
1798            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
1799            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
1800
1801            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
1802            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
1803            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
1804            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
1805
1806            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
1807            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
1808            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
1809            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
1810
1811            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
1812            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
1813            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
1814            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
1815
1816            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
1817            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
1818            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
1819            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
1820
1821            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
1822            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
1823            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
1824            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
1825
1826            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
1827            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
1828            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
1829            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
1830
1831            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
1832            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
1833            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
1834            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
1835
1836            for(row = 0; row < nt; row += 4)
1837            {
1838                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1839                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1840
1841                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
1842                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
1843
1844                /* loading 16 8-bit pixels */
1845                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
1846                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
1847                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
1848                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/
1849
1850                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
1851                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
1852                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
1853                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
1854
1855                src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
1856                src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
1857                src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
1858                src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
1859
1860                /* loading 16 8-bit pixels */
1861                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
1862                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
1863                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
1864                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/
1865
1866                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
1867                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
1868                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
1869                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
1870
1871                src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
1872                src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
1873                src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
1874                src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/
1875
1876                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1877                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
1878                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
1879                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
1880                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
1881
1882                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
1883                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
1884                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
1885                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
1886                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
1887
1888                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1889                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
1890                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
1891                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
1892                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
1893
1894                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1895                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
1896                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
1897                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
1898                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
1899
1900                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
1901                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
1902                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
1903                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
1904                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
1905
1906                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
1907                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
1908                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
1909                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
1910                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
1911
1912                /* converting 16 bit to 8 bit */
1913                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
1914                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
1915                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
1916                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
1917
1918                /* converting 16 bit to 8 bit */
1919                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
1920                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
1921                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
1922                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
1923
1924                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
1925                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
1926
1927                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
1928                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
1929
1930                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
1931                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
1932
1933                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
1934                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
1935
1936                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
1937                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
1938                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
1939                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
1940
1941                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
1942                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
1943                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
1944                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
1945
1946            }
1947        }
1948    }
1949}
1950
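/*
 * Illustrative scalar sketch (guarded by #if 0, never compiled): for the
 * vertical angular modes below, each output row derives a fixed-point position
 * from the prediction angle, splits it into an integer reference offset and a
 * 5-bit fraction, and blends neighbouring interleaved Cb/Cr reference samples.
 * The helper name is hypothetical, and the extended ref_main layout (including
 * the negative indices filled in for modes 19 to 25) is assumed to be set up
 * by the caller exactly as done inside the functions below.
 */
#if 0
static void chroma_vertical_angular_sketch(const UWORD8 *ref_main,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 intra_pred_ang)
{
    WORD32 row, col, pos, idx, fract, ref_main_idx;

    for(row = 0; row < nt; row++)
    {
        pos   = ((row + 1) * intra_pred_ang);
        fract = pos & (31);          /* 1/32-sample phase used as the blend weight */
        idx   = pos >> 5;            /* whole-sample step along the main reference */
        ref_main_idx = 2 * idx + 2;  /* byte offset: interleaved Cb/Cr, hence * 2  */

        for(col = 0; col < 2 * nt; col += 2)
        {
            /* ((32 - fract) * ref[ref_main_idx] + fract * ref[ref_main_idx + 1] + 16) >> 5, per channel */
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(((32 - fract) * ref_main[ref_main_idx + col]
                          + fract * ref_main[ref_main_idx + col + 2] + 16) >> 5);
            pu1_dst[row * dst_strd + col + 1] =
                (UWORD8)(((32 - fract) * ref_main[ref_main_idx + col + 1]
                          + fract * ref_main[ref_main_idx + col + 3] + 16) >> 5);
        }
    }
}
#endif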
1951/**
1952*******************************************************************************
1953*
1954* @brief
1955*  Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
1956* reference  neighboring samples location pointed by 'pu1_ref' to the  TU
1957* block location pointed by 'pu1_dst'
1958*
1959* @par Description:
1960*
1961*
1962* @param[in] pu1_ref
1963*  UWORD8 pointer to the source
1964*
1965* @param[in] pu1_dst
1966*  UWORD8 pointer to the destination
1967*
1968* @param[in] src_strd
1969*  integer source stride
1970*
1971* @param[in] dst_strd
1972*  integer destination stride
1973*
1974* @param[in] nt
1975*  integer Transform Block size
1976*
1977* @param[in] mode
1978*  integer intraprediction mode
1979*
1980* @returns
1981*
1982* @remarks
1983*  None
1984*
1985*******************************************************************************
1986*/
1987
1988void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
1989                                                 WORD32 src_strd,
1990                                                 UWORD8 *pu1_dst,
1991                                                 WORD32 dst_strd,
1992                                                 WORD32 nt,
1993                                                 WORD32 mode)
1994{
1995    WORD32 row, k;
1996    WORD32 intra_pred_ang, idx;
1997    WORD32 inv_ang, inv_ang_sum, pos, fract;
1998    WORD32 ref_main_idx, ref_idx;
1999    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
2000    UWORD8 *ref_main;
2001
2002    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
2003    UNUSED(src_strd);
2004
2005    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
2006    inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
2007
2008    /* Intermediate reference samples for negative angle modes */
2009    /* This has to be removed during optimization */
2010    /* For vertical modes, (ref main = ref above) (ref side = ref left) */
2011    ref_main = ref_temp + 2 * nt;
2012    for(k = 0; k < (2 * (nt + 1)); k += 2)
2013    {
2014        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
2015        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
2016    }
2017
2018    ref_idx = (nt * intra_pred_ang) >> 5;
2019    inv_ang_sum = 128;
2020    ref_main = ref_temp + (2 * (nt - 1));
2021    /* SIMD Optimization can be done using look-up table for the loop */
2022    /* For negative angles, derive the main reference samples from the side */
2023    /* reference samples; refer to section 8.4.4.2.6 */
2024    for(k = -2; k > (2 * ref_idx); k -= 2)
2025    {
2026        inv_ang_sum += inv_ang;
2027        ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
2028        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
2029    }
2030
2031    const_temp_8x16b = _mm_set1_epi16(16);
2032
2033    if(nt == 4) /* if nt =4*/
2034    {
2035        __m128i const_temp2_4x32b, const_temp3_4x32b;
2036        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
2037        __m128i row_4x32b, two_nt_4x32b, src_values12;
2038
2039
2040        const_temp2_4x32b = _mm_set1_epi32(31);
2041        const_temp3_4x32b = _mm_set1_epi32(32);
2042
2043        two_nt_4x32b = _mm_set1_epi32(2);
2044
2045        zero_8x16b = _mm_set1_epi16(0);
2046
2047        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
2048        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
2049
2050        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
2051        {
2052            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
2053            WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
2054
2055            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
2056            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
2057            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
2058            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
2059
2060            /* pos = ((row + 1) * intra_pred_ang); */
2061            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
2062            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
2063            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
2064
2065            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
2066            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
2067
2068            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
2069            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
2070            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
2071            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
2072            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
2073            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
2074            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
2075
2076            /* fract = pos & (31); */
2077            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
2078
2079            /*(32 - fract) */
2080            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
2081
2082            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
2083            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
2084
2085            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
2086            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
2087            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
2088            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
2089
2090            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
2091            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
2092            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
2093            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
2094
2095            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2096            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
2097            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
2098            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
2099
2100            /* interpolation of the whole 4x4 block starts here (no inner loop needed for nt == 4) */
2101            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row=0 */
2102            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row=1 */
2103            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row=2 */
2104            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row=3 */
2105
2106            src_values10 = _mm_srli_si128(src_values0, 2);
2107            src_values11 = _mm_srli_si128(src_values1, 2);
2108            src_values12 = _mm_srli_si128(src_values2, 2);
2109            src_values13 = _mm_srli_si128(src_values3, 2);
2110
2111            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2112            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2113            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2114            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2115
2116            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
2117            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
2118            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
2119            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
2120
2121            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2122            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2123            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2124            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2125            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2126
2127            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2128            src_values0 = _mm_srai_epi16(src_values0,  5);
2129            src_values1 = _mm_srai_epi16(src_values1,  5);
2130            src_values2 = _mm_srai_epi16(src_values2,  5);
2131            src_values3 = _mm_srai_epi16(src_values3,  5);
2132
2133            /* converting 16 bit to 8 bit */
2134            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2135            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2136            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2137            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2138
2139            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
2140            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
2141            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
2142            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
2143
2144        }
2145    }
2146    else if(nt == 8) /* for nt = 8 case */
2147    {
2148        WORD32 ref_main_idx1, fract1, temp, temp1;
2149        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
2150
2151        zero_8x16b = _mm_set1_epi16(0);
2152
2153        for(row = 0; row < nt; row += 2)
2154        {
2155            __m128i src_values0, src_values1, src_values2, src_values3;
2156            __m128i  src_values10, src_values11, src_values12, src_values13;
2157
2158            pos = ((row + 1) * intra_pred_ang);
2159            idx = pos >> 5;
2160            fract = pos & (31);
2161            temp = 32 - fract;
2162            ref_main_idx = 2 * idx + 2; /* col from 0-15 */
2163
2164            pos = ((row + 2) * intra_pred_ang);
2165            idx = pos >> 5;
2166            fract1 = pos & (31);
2167            temp1 = 32 - fract1;
2168            ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */
2169
2170            fract_8x16b  = _mm_set1_epi8(fract);
2171            fract1_8x16b = _mm_set1_epi8(fract1);
2172            temp_8x16b   = _mm_set1_epi8(temp);
2173            temp1_8x16b  = _mm_set1_epi8(temp1);
2174
2175            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2176            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2177
2178            /* row=0 */
2179            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
2180            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
2181
2182            /* row=1 */
2183            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));   /* col = 0-7  */
2184            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8));  /* col = 8-15 */
2185
2186            src_values10 = _mm_srli_si128(src_values0, 2);
2187            src_values11 = _mm_srli_si128(src_values1, 2);
2188            src_values12 = _mm_srli_si128(src_values2, 2);
2189            src_values13 = _mm_srli_si128(src_values3, 2);
2190
2191            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2192            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2193            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2194            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2195
2196            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2197            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2198
2199            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
2200            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
2201
2202            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2203            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2204            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2205
2206            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2207            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2208
2209            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2210            src_values0 = _mm_srai_epi16(src_values0,  5);
2211            src_values1 = _mm_srai_epi16(src_values1,  5);
2212
2213            src_values2 = _mm_srai_epi16(src_values2,  5);
2214            src_values3 = _mm_srai_epi16(src_values3,  5);
2215
2216            /* converting 16 bit to 8 bit */
2217            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2218            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2219
2220            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2221            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2222
2223            /* storing 8-bit pixel values */
2224            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2225            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2226
2227            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
2228            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
2229
2230            pu1_dst += 2 * dst_strd;
2231        }
2232    }
2233    else if(nt == 16)
2234    {
2235        WORD32 temp;
2236        /* unroll the col loop (inner) */
2237        zero_8x16b = _mm_set1_epi16(0);
2238
2239        for(row = 0; row < nt; row += 1)
2240        {
2241            __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
2242            __m128i  src_values10, src_values11, src_values12, src_values13;
2243
2244            pos = ((row + 1) * intra_pred_ang);
2245            idx = pos >> 5;
2246            fract = pos & (31);
2247            temp = 32 - fract;
2248            ref_main_idx = 2 * idx + 2; /* col from 0-31 */
2249
2250            fract_8x16b = _mm_set1_epi8(fract);
2251            temp_8x16b  = _mm_set1_epi8(temp);
2252
2253            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2254
2255            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
2256            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
2257            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16));  /* col = 16-23 */
2258            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24));  /* col = 24-31 */
2259
2260            src_values10 = _mm_srli_si128(src_values0, 2);
2261            src_values11 = _mm_srli_si128(src_values1, 2);
2262            src_values12 = _mm_srli_si128(src_values2, 2);
2263            src_values13 = _mm_srli_si128(src_values3, 2);
2264
2265            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2266            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2267            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2268            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2269
2270            /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2271            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2272            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2273            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
2274            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
2275
2276            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2277            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2278            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2279            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2280            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2281
2282            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2283            src_values0 = _mm_srai_epi16(src_values0,  5);
2284            src_values1 = _mm_srai_epi16(src_values1,  5);
2285            src_values2 = _mm_srai_epi16(src_values2,  5);
2286            src_values3 = _mm_srai_epi16(src_values3,  5);
2287
2288            /* converting 16 bit to 8 bit */
2289            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2290            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2291            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2292            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2293
            /* storing the 8-bit result pixels (8 bytes per store) */
2295            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2296            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2297            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
2298            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
2299
2300            pu1_dst += dst_strd;
2301
2302        }
2303    }
2304}
2305
2306
2307/**
2308*******************************************************************************
2309*
2310* @brief
*  Intra prediction for modes 27 to 33 (positive angle, vertical modes) using
* the neighbouring reference samples pointed to by 'pu1_ref' to predict the
* TU block pointed to by 'pu1_dst'
*
* @par Description:
*  Each output sample is a 1/32-sample linear interpolation between two
*  adjacent reference samples of the same chroma component along the
*  prediction angle
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
2320*
2321* @param[in] pu1_dst
2322*  UWORD8 pointer to the destination
2323*
2324* @param[in] src_strd
*  integer source stride (unused in this function)
2326*
2327* @param[in] dst_strd
2328*  integer destination stride
2329*
2330* @param[in] nt
2331*  integer Transform Block size
2332*
2333* @param[in] mode
2334*  integer intraprediction mode
2335*
2336* @returns
2337*
2338* @remarks
2339*  None
2340*
2341*******************************************************************************
2342*/
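
/*
* For reference, a minimal scalar sketch of the interpolation that the SSSE3
* code below vectorizes for the positive-angle vertical modes; this is an
* illustrative assumption for clarity, not a function from the library, and
* the helper name is hypothetical. Chroma Cb/Cr samples are interleaved,
* hence the doubled index and the 2-byte sample step.
*
*     static void chroma_angular_sketch(UWORD8 *pu1_ref, UWORD8 *pu1_dst,
*                                       WORD32 dst_strd, WORD32 nt, WORD32 ang)
*     {
*         WORD32 row, col;
*         for(row = 0; row < nt; row++)
*         {
*             WORD32 pos   = (row + 1) * ang;   // 1/32-sample position along the angle
*             WORD32 idx   = pos >> 5;          // integer sample offset
*             WORD32 fract = pos & 31;          // fractional offset (0..31)
*             for(col = 0; col < 2 * nt; col++) // 2*nt bytes per row (Cb/Cr interleaved)
*             {
*                 WORD32 ref = (4 * nt) + 2 + 2 * idx + col;
*                 pu1_dst[row * dst_strd + col] =
*                     ((32 - fract) * pu1_ref[ref] +
*                      fract * pu1_ref[ref + 2] + 16) >> 5;
*             }
*         }
*     }
*/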
2343
2344void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
2345                                                 WORD32 src_strd,
2346                                                 UWORD8 *pu1_dst,
2347                                                 WORD32 dst_strd,
2348                                                 WORD32 nt,
2349                                                 WORD32 mode)
2350{
2351    WORD32 row;
2352    WORD32 pos, fract;
2353    WORD32 intra_pred_ang;
2354    WORD32 idx, ref_main_idx;
2355
2356    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
2357    UNUSED(src_strd);
2358
2359    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
2360    const_temp_8x16b = _mm_set1_epi16(16);
2361
2362    if(nt == 4) /* if nt =4*/
2363    {
2364        __m128i const_temp2_4x32b, const_temp3_4x32b;
2365        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
2366        __m128i row_4x32b, two_nt_4x32b, src_values12;
2367
2368        const_temp2_4x32b = _mm_set1_epi32(31);
2369        const_temp3_4x32b = _mm_set1_epi32(32);
2370
2371        two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);
2372
2373        zero_8x16b = _mm_set1_epi16(0);
2374
        /* intra_pred_ang = gai4_ihevc_ang_table_chroma[mode]; */
2376        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
2377        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
2378
2379        {
2380            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
2381            WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
2382
2383            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
2384            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
2385            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
2386            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
2387
2388            /* pos = ((row + 1) * intra_pred_ang); */
2389            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
2390            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
2391            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
2392
2393            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
2394            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
2395
2396            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
2397            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
2398            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
2399            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
2400            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
2401            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
2402            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
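            /*
             * each 32-bit lane of src_values12 holds (4 * nt) + 2 + 2 * idx for
             * one of rows 0..3; ref_main_idx1..4 are therefore the per-row
             * starting byte offsets into pu1_ref
             */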
2403
2404            /* fract = pos & (31); */
2405            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
2406
2407            /*(32 - fract) */
2408            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
2409
2410            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
2411            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
2412
2413            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
2414            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
2415            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
2416            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
2417
2418            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
2419            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
2420            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
2421            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
2422
2423            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2424            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
2425            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
2426            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
2427
            /* inner loop starts from here */
            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row = 0 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row = 1 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row = 2 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row = 3 */
2433
2434            src_values10 = _mm_srli_si128(src_values0, 2);
2435            src_values11 = _mm_srli_si128(src_values1, 2);
2436            src_values12 = _mm_srli_si128(src_values2, 2);
2437            src_values13 = _mm_srli_si128(src_values3, 2);
2438
2439            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2440            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2441            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2442            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2443
2444            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
2445            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
2446            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
2447            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
2448
2449            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2450            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2451            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2452            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2453            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2454
2455            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2456            src_values0 = _mm_srai_epi16(src_values0,  5);
2457            src_values1 = _mm_srai_epi16(src_values1,  5);
2458            src_values2 = _mm_srai_epi16(src_values2,  5);
2459            src_values3 = _mm_srai_epi16(src_values3,  5);
2460
2461            /* converting 16 bit to 8 bit */
2462            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2463            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2464            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2465            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2466
2467            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
2468            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
2469            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
2470            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
2471
2472        }
2473    }
2474
    else if(nt == 8) /* for nt = 8 case */
2476    {
2477        WORD32 ref_main_idx1, fract1, temp, temp1;
2478        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
2479
2480        zero_8x16b = _mm_set1_epi16(0);
2481
2482        for(row = 0; row < nt; row += 2)
2483        {
2484            __m128i src_values0, src_values1, src_values2, src_values3;
2485            __m128i  src_values10, src_values11, src_values12, src_values13;
2486
2487            pos = ((row + 1) * intra_pred_ang);
2488            idx = pos >> 5;
2489            fract = pos & (31);
2490            temp = 32 - fract;
2491            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
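            /*
             * (4 * nt) + 2 is the first byte of the top reference row (past the
             * interleaved left references and the corner sample); idx is doubled
             * because Cb and Cr are interleaved, so one sample step is two bytes
             */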
2492
2493            pos = ((row + 2) * intra_pred_ang);
2494            idx = pos >> 5;
2495            fract1 = pos & (31);
2496            temp1 = 32 - fract1;
2497            ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
2498
2499            fract_8x16b  = _mm_set1_epi8(fract);
2500            fract1_8x16b = _mm_set1_epi8(fract1);
2501            temp_8x16b   = _mm_set1_epi8(temp);
2502            temp1_8x16b  = _mm_set1_epi8(temp1);
2503
2504            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2505            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2506
2507            /* row=0 */
2508            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
2509            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
2510
2511            /* row=1 */
2512            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));    /* col = 0-7  */
2513            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8));  /* col = 8-15 */
2514
2515            src_values10 = _mm_srli_si128(src_values0, 2);
2516            src_values11 = _mm_srli_si128(src_values1, 2);
2517            src_values12 = _mm_srli_si128(src_values2, 2);
2518            src_values13 = _mm_srli_si128(src_values3, 2);
2519
2520            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2521            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2522            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2523            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2524
2525            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2526            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2527
2528            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
2529            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
2530
2531            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2532            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2533            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2534
2535            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2536            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2537
2538            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2539            src_values0 = _mm_srai_epi16(src_values0,  5);
2540            src_values1 = _mm_srai_epi16(src_values1,  5);
2541
2542            src_values2 = _mm_srai_epi16(src_values2,  5);
2543            src_values3 = _mm_srai_epi16(src_values3,  5);
2544
2545            /* converting 16 bit to 8 bit */
2546            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2547            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2548
2549            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2550            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2551
            /* storing the 8-bit result pixels (8 bytes per store) */
2553            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2554            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2555
2556            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
2557            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
2558
2559            pu1_dst += 2 * dst_strd;
2560        }
2561    }
2562    else if(nt == 16)
2563    {
2564        WORD32 temp;
        /* inner (col) loop is fully unrolled */
2566        zero_8x16b = _mm_set1_epi16(0);
2567
2568        for(row = 0; row < nt; row += 1)
2569        {
2570            __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
2571            __m128i  src_values10, src_values11, src_values12, src_values13;
2572
2573            pos = ((row + 1) * intra_pred_ang);
2574            idx = pos >> 5;
2575            fract = pos & (31);
2576            temp = 32 - fract;
2577            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */
2578
2579            fract_8x16b = _mm_set1_epi8(fract);
2580            temp_8x16b  = _mm_set1_epi8(temp);
2581
2582            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2583
2584            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
2585            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
2586            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16));  /* col = 16-23 */
2587            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24));  /* col = 24-31 */
2588
2589            src_values10 = _mm_srli_si128(src_values0, 2);
2590            src_values11 = _mm_srli_si128(src_values1, 2);
2591            src_values12 = _mm_srli_si128(src_values2, 2);
2592            src_values13 = _mm_srli_si128(src_values3, 2);
2593
2594            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2595            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2596            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2597            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2598
            /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
2600            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2601            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2602            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
2603            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
2604
2605            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2606            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2607            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2608            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2609            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2610
2611            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2612            src_values0 = _mm_srai_epi16(src_values0,  5);
2613            src_values1 = _mm_srai_epi16(src_values1,  5);
2614            src_values2 = _mm_srai_epi16(src_values2,  5);
2615            src_values3 = _mm_srai_epi16(src_values3,  5);
2616
2617            /* converting 16 bit to 8 bit */
2618            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2619            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2620            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2621            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2622
            /* storing the 8-bit result pixels (8 bytes per store) */
2624            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2625            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2626            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
2627            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
2628
2629            pu1_dst += dst_strd;
2630
2631        }
2632    }
2633}
2634