1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18
19/**
20*******************************************************************************
21* @file
22*  ihevc_inter_pred_filters_atom_intr.c
23*
24* @brief
25*  Contains function definitions for inter prediction  interpolation filters
26*  coded in x86 intrinsics
27*
28*
29* @author
30*
31*
32* @par List of Functions:
33*  - ihevc_inter_pred_luma_copy_ssse3()
34*  - ihevc_inter_pred_luma_horz_ssse3()
35*  - ihevc_inter_pred_luma_vert_ssse3()
36*  - ihevc_inter_pred_luma_copy_w16out_ssse3()
37*  - ihevc_inter_pred_luma_horz_w16out_ssse3()
38*  - ihevc_inter_pred_luma_vert_w16out_ssse3()
39*  - ihevc_inter_pred_luma_vert_w16inp_ssse3()
40*  - ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3()
41*  - ihevc_inter_pred_chroma_copy_ssse3()
42*  - ihevc_inter_pred_chroma_horz_ssse3()
43*  - ihevc_inter_pred_chroma_vert_ssse3()
44*  - ihevc_inter_pred_chroma_copy_w16out_ssse3()
45*  - ihevc_inter_pred_chroma_horz_w16out_ssse3()
46*  - ihevc_inter_pred_chroma_vert_w16out_ssse3()
47*  - ihevc_inter_pred_chroma_vert_w16inp_ssse3()
48*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3()
49*
50* @remarks
51*  None
52*
53*******************************************************************************
54*/
55
56
57/*****************************************************************************/
58/* File Includes                                                             */
59/*****************************************************************************/
#include <assert.h>
#include <string.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"

#include <immintrin.h>
71
72/*****************************************************************************/
73/* Function Definitions                                                      */
74/*****************************************************************************/
75
76/**
77*******************************************************************************
78*
79* @brief
80*       Interprediction luma function for copy
81*
82* @par Description:
83*    Copies the array of width 'wd' and height 'ht' from the  location pointed
84*    by 'src' to the location pointed by 'dst'
85*
86* @param[in] pu1_src
87*  UWORD8 pointer to the source
88*
89* @param[out] pu1_dst
90*  UWORD8 pointer to the destination
91*
92* @param[in] src_strd
93*  integer source stride
94*
95* @param[in] dst_strd
96*  integer destination stride
97*
* @param[in] pi1_coeff
*  WORD8 pointer to the filter coefficients (unused by the copy variant;
*  kept only so all inter-prediction functions share one prototype)
100*
101* @param[in] ht
102*  integer height of the array
103*
104* @param[in] wd
105*  integer width of the array
106*
107* @returns
108*
109* @remarks
110*  None
111*
112* Assumption : ht%4 == 0, wd%4 == 0
113*
114*******************************************************************************
115*/
116
117
118void ihevc_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
119                                      UWORD8 *pu1_dst,
120                                      WORD32 src_strd,
121                                      WORD32 dst_strd,
122                                      WORD8 *pi1_coeff,
123                                      WORD32 ht,
124                                      WORD32 wd)
125{
126
127    WORD32 row, col;
128    __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
129    UNUSED(pi1_coeff);
130    ASSERT(wd % 4 == 0); /* checking assumption*/
131    ASSERT(ht % 4 == 0); /* checking assumption*/
132
133/*  outer for loop starts from here */
134    if(0 == (wd & 15)) /* wd multiple of 16 case */
135    {
136        for(row = 0; row < ht; row += 4)
137        {
138            for(col = 0; col < wd; col += 16)
139            {
140                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
141                src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
142                src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
143                src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
144                src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
145
146                /* storing 16 8-bit output values */
147                _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
148                _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
149                _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
150                _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
151
152                pu1_src += 16; /* pointer update */
153                pu1_dst += 16; /* pointer update */
154            } /* inner for loop ends here(16-output values in single iteration) */
155
156            pu1_src += 4 * src_strd - wd; /* pointer update */
157            pu1_dst += 4 * dst_strd - wd; /* pointer update */
158        }
159
160    }
161    else if(0 == (wd & 7)) /* multiple of 8 case */
162    {
163        for(row = 0; row < ht; row += 4)
164        {
165            for(col = 0; col < wd; col += 8)
166            {
167                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
168                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
169                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
170                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
171                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
172
173                /* storing 16 8-bit output values */
174                _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
175                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
176                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
177                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
178
179                pu1_src += 8; /* pointer update */
180                pu1_dst += 8; /* pointer update */
181            } /*  inner for loop ends here(8-output values in single iteration) */
182
183            pu1_src += 4 * src_strd - wd; /* pointer update */
184            pu1_dst += 4 * dst_strd - wd; /* pointer update */
185        }
186    }
187    else /* wd = multiple of 4 case */
188    {
189        WORD32 dst0, dst1, dst2, dst3;
190        for(row = 0; row < ht; row += 4)
191        {
192            for(col = 0; col < wd; col += 4)
193            {
194                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
195                src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
196                src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
197                src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
198                src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
199
200                dst0 = _mm_cvtsi128_si32(src0_16x8b);
201                dst1 = _mm_cvtsi128_si32(src1_16x8b);
202                dst2 = _mm_cvtsi128_si32(src2_16x8b);
203                dst3 = _mm_cvtsi128_si32(src3_16x8b);
204
205                /* storing 4 8-bit output values */
206                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
207                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
208                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
209                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
210
211                pu1_src += 4; /* pointer update */
212                pu1_dst += 4; /* pointer update */
213            } /*  inner for loop ends here(4- output values in single iteration) */
214
215            pu1_src += 4 * src_strd - wd; /* pointer update */
216            pu1_dst += 4 * dst_strd - wd; /* pointer update */
217        }
218    }
219}
220
221/* INTER_PRED_LUMA_COPY */
222
223/**
224*******************************************************************************
225*
226* @brief
227*     Interprediction luma filter for horizontal input
228*
229* @par Description:
230*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
231*    to the elements pointed by 'pu1_src' and  writes to the location pointed
232*    by 'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
233*
234* @param[in] pu1_src
235*  UWORD8 pointer to the source
236*
237* @param[out] pu1_dst
238*  UWORD8 pointer to the destination
239*
240* @param[in] src_strd
241*  integer source stride
242*
243* @param[in] dst_strd
244*  integer destination stride
245*
246* @param[in] pi1_coeff
247*  WORD8 pointer to the filter coefficients
248*
249* @param[in] ht
250*  integer height of the array
251*
252* @param[in] wd
253*  integer width of the array
254*
255* @returns
256*
257* @remarks
258*  None
259*
260*******************************************************************************
261*/
262void ihevc_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
263                                      UWORD8 *pu1_dst,
264                                      WORD32 src_strd,
265                                      WORD32 dst_strd,
266                                      WORD8 *pi1_coeff,
267                                      WORD32 ht,
268                                      WORD32 wd)
269{
270    WORD32 row, col;
271
272    /* all 128 bit registers are named with a suffix mxnb, where m is the */
273    /* number of n bits packed in the register                            */
274    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
275    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
276    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
277    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b, res_temp8_8x16b;
278    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b, res_temp18_8x16b;
279    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
280    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
281
282    ASSERT(wd % 4 == 0); /* checking assumption*/
283
284    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
285    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
286    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
287    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
288    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
289    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
290
291    /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
292    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
293    zero_8x16b = _mm_set1_epi32(0);
294    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
295
296    mask_low_32b = _mm_cmpeq_epi16(zero_8x16b, zero_8x16b);
297    mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
298    mask_low_32b = _mm_slli_si128(mask_low_32b, 4);
299
300    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
301    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
302    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
303    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
304
305    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[4] */
306    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[4] */
307
308    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4] */
309    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[4] */
310
311    if(0 == (ht & 1)) /* ht multiple of 2 case */
312    {
313
314        if(0 == (wd & 7)) /* wd = multiple of 8 case */
315        {
316            for(row = 0; row < ht; row += 2)
317            {
318
319                int offset = 0;
320
321                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
322                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
323
324
325                for(col = 0; col < wd; col += 8)
326                {
327                    /*load 16 pixel values from row 0*/
328                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));         /* row = 0 */
329
330                    /*load 16 pixel values from row 1*/
331                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
332
333                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
334                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
335                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
336                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
337                                                                                           /* row = 0 */
338                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
339                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
340                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
341                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
342                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
343
344                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
345                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
346                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
347                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
348                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
349
350                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
351                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
352                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
353                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
354                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
355
356                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
357                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
358                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
359
360                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
361                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
362                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
363
364                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
365
366                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row =1 */
367                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row =1 */
368                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
369                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
370                                                                                              /* row = 1 */
371                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
372                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
373                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
374                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
375                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
376
377                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
378                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
379                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
380                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
381                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
382
383                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
384                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
385                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
386                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
387                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
388
389                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
390                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
391                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
392
393                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
394                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
395                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
396
397                    /* to store the 1st 4 pixels res. */
398                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
399
400                    offset += 8; /* To pointer updates*/
401                }
402                pu1_src += 2 * src_strd;  /* pointer updates*/
403                pu1_dst += 2 * dst_strd;  /* pointer updates*/
404            }
405        }
406        else /* wd = multiple of 4 case */
407        {
408            for(row = 0; row < ht; row += 2)
409            {
410                int offset = 0;
411
412                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
413                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
414
415
416                for(col = 0; col < wd; col += 4)
417                {
418                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
419                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
420                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
421
422                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
423                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
424                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
425                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
426                                                                                           /* row = 0 */
427                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
428                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
429                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
430                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
431                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
432
433                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
434                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
435                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
436                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
437                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
438
439                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
440                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
441                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
442                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
443                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
444
445                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
446                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
447                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
448
449                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
450                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
451                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
452
453                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
454                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
455                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
456                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
457
458                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
459
460                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
461                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
462                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
463                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
464                                                                                              /* row = 1 */
465                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
466                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
467                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
468                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
469                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
470
471                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
472                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
473                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
474                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
475                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
476
477                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
478                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
479                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
480                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
481                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
482
483                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
484                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
485                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
486
487                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
488                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
489                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
490
491                    res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
492                    res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
493                    res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
494                    res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
495
496                    /* to store the 1st 4 pixels res. */
497                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
498
499                    offset += 4; /* To pointer updates*/
500                }
501                pu1_src += 2 * src_strd;  /* Pointer update */
502                pu1_dst += 2 * dst_strd;  /* Pointer update */
503            }
504        }
505    }
506    else /* odd ht */
507    {
508        if(0 == (wd & 7)) /* multiple of 8 case */
509        {
510            for(row = 0; row < ht; row++)
511            {
512                int offset = 0;
513
514
515                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
516                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
517
518
519                for(col = 0; col < wd; col += 8)
520                {
521                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
522                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
523
524                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
525                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
526                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
527                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
528                                                                                           /* row = 0 */
529                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
530                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
531                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
532                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
533                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
534
535                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
536                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
537                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
538                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
539                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
540
541                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
542                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
543                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
544                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
545                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
546
547                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
548                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
549                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
550
551                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
552                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
553                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
554
555                    /* to store the 1st 4 pixels res. */
556                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
557
558                    offset += 8; /* To pointer updates*/
559                }
560                pu1_src += src_strd;    /* pointer updates*/
561                pu1_dst += dst_strd;    /* pointer updates*/
562            }
563        }
564        else  /* wd = multiple of 4 case */
565        {
566            for(row = 0; row < (ht - 1); row += 2)
567            {
568                int offset = 0;
569
570                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
571                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
572
573
574                for(col = 0; col < wd; col += 4)
575                {
576                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
577                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
578                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
579
580                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
581                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
582                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
583                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
584                                                                                           /* row = 0 */
585                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
586                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
587                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
588                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
589                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
590
591                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
592                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
593                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
594                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
595                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
596
597                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
598                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
599                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
600                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
601                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
602
603                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
604                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
605                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
606
607                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
608                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
609                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
610
611                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
612                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
613                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
614                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
615
616                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
617
618                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
619                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
620                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
621                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
622                                                                                              /* row = 1 */
623                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
624                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
625                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
626                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
627                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
628
629                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
630                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
631                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
632                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
633                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
634
635                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
636                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
637                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
638                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
639                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
640
641                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
642                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
643                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
644
645                    res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
646                    res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
647                    res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
648
649                    res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
650                    res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
651                    res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
652                    res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
653
654                    /* to store the 1st 4 pixels res. */
655                    _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
656
657                    offset += 4; /* To pointer updates*/
658                }
659                pu1_src += 2 * src_strd;  /* Pointer update */
660                pu1_dst += 2 * dst_strd;  /* Pointer update */
661            }
662            { /* last repeat at outside the loop */
663                int offset = 0;
664                for(col = 0; col < wd; col += 4)
665                {
666                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
667                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
668
669                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
670                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
671                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
672                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
673                                                                                           /* row = 0 */
674                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
675                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
676                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
677                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
678                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
679
680                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
681                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
682                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
683                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
684                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
685
686                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
687                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
688                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
689                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
690                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
691
692                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
693                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
694                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
695
696                    res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
697                    res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
698                    res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
699
700                    res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
701                    res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
702                    res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
703                    res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
704
705                    /* to store the 1st 4 pixels res. */
706                    _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
707
708                    offset += 4; /* To pointer updates*/
709                }
710            }
711        }
712    }
713}
714
715
716/**
717*******************************************************************************
718*
719* @brief
720*    Interprediction luma filter for vertical input
721*
722* @par Description:
*   Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
724*   the elements pointed by 'pu1_src' and  writes to the location pointed by
725*   'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
726*
727* @param[in] pu1_src
728*  UWORD8 pointer to the source
729*
730* @param[out] pu1_dst
731*  UWORD8 pointer to the destination
732*
733* @param[in] src_strd
734*  integer source stride
735*
736* @param[in] dst_strd
737*  integer destination stride
738*
739* @param[in] pi1_coeff
740*  WORD8 pointer to the filter coefficients
741*
742* @param[in] ht
743*  integer height of the array
744*
745* @param[in] wd
746*  integer width of the array
747*
748* @returns
749*
750* @remarks
751*  None
752*
753*******************************************************************************
754*/
755void ihevc_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
756                                      UWORD8 *pu1_dst,
757                                      WORD32 src_strd,
758                                      WORD32 dst_strd,
759                                      WORD8 *pi1_coeff,
760                                      WORD32 ht,
761                                      WORD32 wd)
762{
763    WORD32 row, col;
764    UWORD8 *pu1_src_copy;
765    UWORD8 *pu1_dst_copy;
766    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
767    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
768    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
769    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
770    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
771    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s17_8x16b, s18_8x16b, s19_8x16b;
772    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s27_8x16b, s28_8x16b, s29_8x16b;
773    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s37_8x16b, s38_8x16b, s39_8x16b;
774
775    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
776    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
777
778    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
779    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
780    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
781    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
782    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
783    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
784    PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
785    PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
786
787/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
788    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
789
790    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
791    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
792    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
793    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
794
795    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
796    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
797
798    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4] */
799    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[4] */
800
801/*  seting  values in register */
802    zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
803    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
804    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
805    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
806
807/*  outer for loop starts from here */
808    if(wd % 8 == 0)
809    { /* wd = multiple of 8 case */
810
811        pu1_src_copy = pu1_src;
812        pu1_dst_copy = pu1_dst;
813
814        for(col = 0; col < wd; col += 8)
815        {
816
817            pu1_src = pu1_src_copy + col;
818            pu1_dst = pu1_dst_copy + col;
819
820            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
821            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
822            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
823            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
824
825            /*load 8 pixel values.*/
826            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
827
828            /*load 8 pixel values*/
829            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
830
831            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
832
833            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
834
835            /*load 8 pixel values*/
836            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
837
838            /*load 8 pixel values*/
839            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
840
841            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
842
843            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
844
845            /*load 8 pixel values*/
846            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
847
848            /*load 8 pixel values*/
849            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
850
851            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
852
853            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
854
855            /*load 8 pixel values*/
856            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
857
858            /*load 8 pixel values*/
859            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
860
861            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
862
863            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
864
865            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
866            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
867            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
868
869            s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
870
871            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
872            s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
873
874            /* i2_tmp = CLIP_U8(i2_tmp);*/
875            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
876
877            /* store 8 8-bit output values  */
878            /* Store the output pixels of row 0*/
879            _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
880
881            /* ROW 2*/
882            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
883            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
884            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
885
886            /*load 8 pixel values*/
887            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
888
889            /*load 8 pixel values*/
890            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
891
892            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
893
894            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
895
896            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
897            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
898            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
899
900            s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
901
902            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
903            s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
904
905            /* i2_tmp = CLIP_U8(i2_tmp);*/
906            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
907
908            /* store 8 8-bit output values  */
909            /* Store the output pixels of row 2*/
910            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
911
912
913            /*ROW 1*/
914            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
915
916            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
917
918            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
919
920            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
921
922            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
923
924            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
925
926            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
927
928            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
929
930            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
931            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
932            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
933
934            s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
935
936            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
937            s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
938
939            /* i2_tmp = CLIP_U8(i2_tmp);*/
940            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
941
942            /* store 8 8-bit output values  */
943            /* Store the output pixels of row 1*/
944            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
945
946
947            /* ROW 3*/
948            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
949            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
950            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
951
952            /*load 8 pixel values*/
953            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
954
955            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
956
957            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
958
959            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
960            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
961            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
962
963            s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
964
965            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
966            s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
967
968            /* i2_tmp = CLIP_U8(i2_tmp);*/
969            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
970
971            /* store 8 8-bit output values  */
            /* Store the output pixels of row 3*/
973            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
974
975            pu1_src += (8 * src_strd);
976            pu1_dst += (4 * dst_strd);
977
978            for(row = 4; row < ht; row += 4)
979            {
980                PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
981                PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
982                PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
983                PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
984
985
986                s3_0_16x8b = s3_2_16x8b;
987                s3_1_16x8b = s3_3_16x8b;
988                s3_2_16x8b = s3_4_16x8b;
989
990                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
991                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
992                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
993
994                /*load 8 pixel values from (cur_row + 4)th row*/
995                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
996
997                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
998                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
999
1000                s4_0_16x8b = s4_2_16x8b;
1001                s4_1_16x8b = s4_3_16x8b;
1002                s4_2_16x8b = s4_4_16x8b;
1003
1004                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
1005                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
1006                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
1007
1008                s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
1009
1010                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1011                s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1012
1013                /* i2_tmp = CLIP_U8(i2_tmp);*/
1014                s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
1015
1016                /* store 8 8-bit output values  */
1017                /* Store the output pixels of row 4*/
1018                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
1019
1020                /* row + 2*/
1021                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
1022                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
1023                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
1024
1025                /*load 8 pixel values from (cur_row + 5)th row*/
1026                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
1027
1028                /*load 8 pixel values from (cur_row + 6)th row*/
1029                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
1030
1031                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
1032                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
1033
1034                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
1035
1036                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
1037                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
1038                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
1039
1040                s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
1041
1042                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1043                s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1044
1045                /* i2_tmp = CLIP_U8(i2_tmp);*/
1046                s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
1047
1048                /* store 8 8-bit output values  */
1049                /* Store the output pixels of (cur_row+2)*/
1050                _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
1051
1052
1053                /*row + 1*/
1054                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
1055                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
1056                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
1057
1058                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
1059                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
1060                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
1061
1062                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
1063                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
1064                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
1065
1066                s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
1067
1068                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1069                s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1070
1071                /* i2_tmp = CLIP_U8(i2_tmp);*/
1072                s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
1073
1074                /* store 8 8-bit output values  */
1075                /* Store the output pixels of (cur_row + 1)*/
1076                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
1077
1078
1079                /* row + 3*/
1080                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
1081                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
1082                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
1083
1084                /*load 8 pixel values from (cur_row + 7)th row*/
1085                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
1086
1087                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
1088                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
1089
1090                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
1091
1092                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
1093                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
1094                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
1095
1096                s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
1097
1098                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1099                s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1100
1101                /* i2_tmp = CLIP_U8(i2_tmp);*/
1102                s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
1103
1104                /* store 8 8-bit output values  */
1105                /* Store the output pixels of (cur_row+3)*/
1106                _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
1107
1108                s2_10_16x8b = s2_3_16x8b;
1109
1110                pu1_src += 4 * src_strd; /* pointer update */
1111                pu1_dst += 4 * dst_strd; /* pointer update */
1112            }
1113        }
1114    }
    else /* wd = multiple of 4 case */
1116    {
1117
1118        pu1_src_copy = pu1_src;
1119        pu1_dst_copy = pu1_dst;
1120
1121        for(col = 0; col < wd; col += 4)
1122        {
1123
1124            pu1_src = pu1_src_copy + col;
1125            pu1_dst = pu1_dst_copy + col;
1126
1127            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
1128            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
1129            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
1130            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
1131
1132
1133            /*load 8 pixel values */
1134            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
1135
1136            /*load 8 pixel values */
1137            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
1138
1139            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
1140
1141            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
1142
1143            /*load 8 pixel values */
1144            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
1145
1146            /*load 8 pixel values */
1147            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
1148
1149            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
1150
1151            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
1152
1153            /*load 8 pixel values */
1154            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
1155
1156            /*load 8 pixel values */
1157            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
1158
1159            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
1160
1161            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
1162
1163            /*load 8 pixel values */
1164            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
1165
1166            /*load 8 pixel values */
1167            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
1168
1169            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
1170
1171            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
1172
1173            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
1174            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
1175            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
1176
1177            s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
1178
1179            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1180            s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1181
1182            /* i2_tmp = CLIP_U8(i2_tmp);*/
1183            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
1184            s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
1185            s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
1186            s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
1187            s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
1188            /* store 8 8-bit output values  */
1189            /* Store the output pixels of row 0*/
1190            _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
1191
1192            /* ROW 2*/
1193            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
1194            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
1195            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
1196
1197            /*load 8 pixel values */
1198            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
1199
1200            /*load 8 pixel values */
1201            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
1202
1203            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
1204
1205            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
1206
1207            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
1208            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
1209            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
1210
1211            s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
1212
1213            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1214            s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1215
1216            /* i2_tmp = CLIP_U8(i2_tmp);*/
1217            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
1218            s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
1219            s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
1220            s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
1221            s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
1222            /* store 8 8-bit output values  */
1223            /* Store the output pixels of row 2*/
1224            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
1225
1226
1227            /*ROW 1*/
1228            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
1229
1230            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
1231
1232            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
1233
1234            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
1235
1236            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
1237
1238            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
1239
1240            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
1241
1242            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
1243
1244            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
1245            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
1246            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
1247
1248            s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
1249
1250            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1251            s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1252
1253            /* i2_tmp = CLIP_U8(i2_tmp);*/
1254            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
1255            s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
1256            s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
1257            s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
1258            s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
1259            /* store 8 8-bit output values  */
1260            /* Store the output pixels of row 1*/
1261            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s18_8x16b);
1262
1263
1264            /* ROW 3*/
1265            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
1266            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
1267            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
1268
1269            /*load 8 pixel values */
1270            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
1271
1272            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
1273
1274            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
1275
1276            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
1277            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
1278            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
1279
1280            s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
1281
1282            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1283            s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1284
1285            /* i2_tmp = CLIP_U8(i2_tmp);*/
1286            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
1287
1288            s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
1289            s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
1290            s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
1291            s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
1292
1293            /* store 8 8-bit output values  */
1294            /* Store the output pixels of row 2*/
1295            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
1296
1297            pu1_src += (8 * src_strd);
1298            pu1_dst += (4 * dst_strd);
1299
1300            for(row = 4; row < ht; row += 4)
1301            {
1302
1303                PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
1304                PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
1305                PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
1306                PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
1307
1308
1309                s3_0_16x8b = s3_2_16x8b;
1310                s3_1_16x8b = s3_3_16x8b;
1311                s3_2_16x8b = s3_4_16x8b;
1312
1313                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
1314                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
1315                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
1316
1317                /*load 16 pixel values from (cur_row + 4)th row*/
1318                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
1319
1320                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
1321                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
1322
1323                s4_0_16x8b = s4_2_16x8b;
1324                s4_1_16x8b = s4_3_16x8b;
1325                s4_2_16x8b = s4_4_16x8b;
1326
1327                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
1328                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
1329                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
1330
1331                s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
1332
1333                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1334                s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1335
1336                /* i2_tmp = CLIP_U8(i2_tmp);*/
1337                s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
1338
1339                s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
1340                s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
1341                s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
1342                s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
1343
1344                /* store 8 8-bit output values  */
1345                /* Store the output pixels of row 4*/
1346                _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
1347
1348                /* row + 2*/
1349                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
1350                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
1351                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
1352
1353                /*load 16 pixel values from (cur_row + 5)th row*/
1354                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
1355
1356                /*load 16 pixel values from (cur_row + 6)th row*/
1357                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
1358
1359                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
1360                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
1361
1362                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
1363
1364                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
1365                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
1366                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
1367
1368                s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
1369
1370                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1371                s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1372
1373                /* i2_tmp = CLIP_U8(i2_tmp);*/
1374                s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
1375
1376                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
1377                s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
1378                s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
1379                s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
1380
1381                /* store 8 8-bit output values  */
1382                /* Store the output pixels of (cur_row+2)*/
1383                _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
1384
1385
1386                /*row + 1*/
1387                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
1388                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
1389                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
1390
1391                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
1392                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
1393                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
1394
1395                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
1396                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
1397                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
1398
1399                s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
1400
1401                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1402                s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1403
1404                /* i2_tmp = CLIP_U8(i2_tmp);*/
1405                s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
1406
1407                s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
1408                s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
1409                s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
1410                s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
1411
1412                /* store 8 8-bit output values  */
1413                /* Store the output pixels of (cur_row + 1)*/
1414                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s18_8x16b);
1415
1416
1417                /* row + 3*/
1418                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
1419                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
1420                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
1421
1422                /*load 16 pixel values from (cur_row + 7)th row*/
1423                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
1424
1425                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
1426                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
1427
1428                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
1429
1430                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
1431                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
1432                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
1433
1434                s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
1435
1436                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
1437                s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
1438
1439                /* i2_tmp = CLIP_U8(i2_tmp);*/
1440                s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
1441
1442                s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
1443                s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
1444                s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
1445                s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
1446
1447                /* store 8 8-bit output values  */
1448                /* Store the output pixels of (cur_row+3)*/
1449                _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
1450
1451                s2_10_16x8b = s2_3_16x8b;
1452
1453                pu1_src += 4 * src_strd; /* pointer update */
1454                pu1_dst += 4 * dst_strd; /* pointer update */
1455            }
1456        }
1457    }
1458}
1459
1460
1461/**
1462*******************************************************************************
1463*
1464* @brief
1465*       Interprediction luma filter for copy 16bit output
1466*
1467* @par Description:
1468*    Copies the array of width 'wd' and height 'ht' from the  location pointed
1469*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
1470*    bits and is used as input for vertical filtering or weighted prediction
1471*
1472* @param[in] pu1_src
1473*  UWORD8 pointer to the source
1474*
1475* @param[out] pi2_dst
1476*  WORD16 pointer to the destination
1477*
1478* @param[in] src_strd
1479*  integer source stride
1480*
1481* @param[in] dst_strd
1482*  integer destination stride
1483*
1484* @param[in] pi1_coeff
1485*  WORD8 pointer to the filter coefficients
1486*
1487* @param[in] ht
1488*  integer height of the array
1489*
1490* @param[in] wd
1491*  integer width of the array
1492*
1493* @returns
1494*
1495* @remarks
1496*  None
1497*
1498*******************************************************************************
1499*/
1500
1501void ihevc_inter_pred_luma_copy_w16out_ssse3(UWORD8 *pu1_src,
1502                                             WORD16 *pi2_dst,
1503                                             WORD32 src_strd,
1504                                             WORD32 dst_strd,
1505                                             WORD8 *pi1_coeff,
1506                                             WORD32 ht,
1507                                             WORD32 wd)
1508{
1509    WORD32 row, col;
1510    __m128i  s3, zero_8x16b;
1511
1512    ASSERT(wd % 2 == 0); /* checking assumption*/
1513    ASSERT(ht % 2 == 0); /* checking assumption*/
1514    UNUSED(pi1_coeff);
1515    zero_8x16b = _mm_setzero_si128();
1516/*  outer for loop starts from here */
1517    if(wd % 8 == 0) /* wd = multiple of 8 case */
1518    {
1519        for(row = 0; row < ht; row += 2)
1520        {
1521            int offset = 0;
1522            for(col = 0; col < wd; col += 8)
1523            {
1524/* row =0 */
1525                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
1526                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
1527                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
1528
1529                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1530
1531                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
1532                _mm_store_si128((__m128i *)(pi2_dst + offset), s3);
1533
1534/* row =1 */
1535                /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
1536                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
1537                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
1538
1539                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1540
1541                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
1542                _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
1543
1544                offset += 8; /* To pointer update */
1545            } /* inner for loop ends here(8-output values in single iteration) */
1546
1547            pu1_src += 2 * src_strd; /* pointer update */
1548            pi2_dst += 2 * dst_strd; /* pointer update */
1549        }
1550    }
1551    else /* wd = multiple of 4 case */
1552    {
1553        for(row = 0; row < ht; row += 2)
1554        {
1555            int offset = 0;
1556            for(col = 0; col < wd; col += 4)
1557            {
1558/* row =0 */
1559                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
1560                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
1561                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
1562
1563                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1564
1565                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
1566                _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
1567
1568/* row =1 */
1569                /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
1570                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
1571                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
1572
1573                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
1574
1575                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
1576                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
1577                offset += 4; /* To pointer update */
1578            } /* inner for loop ends here(4-output values in single iteration) */
1579
1580            pu1_src += 2 * src_strd; /* pointer update */
1581            pi2_dst += 2 * dst_strd; /* pointer update */
1582        }
1583    }
1584
1585}
1586
1587/**
1588*******************************************************************************
1589*
1590* @brief
1591*     Interprediction luma filter for horizontal 16bit output
1592*
1593* @par Description:
*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
*    to the elements pointed by 'pu1_src' and  writes to the location pointed
*    by 'pi2_dst'. No downshifting or clipping is done and the output is used
*    as an input for vertical filtering or weighted  prediction
1598*
1599* @param[in] pu1_src
1600*  UWORD8 pointer to the source
1601*
1602* @param[out] pi2_dst
1603*  WORD16 pointer to the destination
1604*
1605* @param[in] src_strd
1606*  integer source stride
1607*
1608* @param[in] dst_strd
1609*  integer destination stride
1610*
1611* @param[in] pi1_coeff
1612*  WORD8 pointer to the filter coefficients
1613*
1614* @param[in] ht
1615*  integer height of the array
1616*
1617* @param[in] wd
1618*  integer width of the array
1619*
1620* @returns
1621*
1622* @remarks
1623*  None
1624*
1625*******************************************************************************
1626*/
1627void ihevc_inter_pred_luma_horz_w16out_ssse3(UWORD8 *pu1_src,
1628                                             WORD16 *pi2_dst,
1629                                             WORD32 src_strd,
1630                                             WORD32 dst_strd,
1631                                             WORD8 *pi1_coeff,
1632                                             WORD32 ht,
1633                                             WORD32 wd)
1634{
1635    WORD32 row, col;
1636
1637    /* all 128 bit registers are named with a suffix mxnb, where m is the */
1638    /* number of n bits packed in the register                            */
1639
1640    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
1641    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
1642    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b;
1643    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b;
1644    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
1645    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
1646
1647    ASSERT(wd % 4 == 0); /* checking assumption*/
1648
1649    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
1650    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
1651    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
1652    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
1653    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
1654    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
1655
1656    /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
1657    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
1658
1659
1660    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
1661    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
1662    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
1663    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
1664
1665    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[4] */
1666    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[4] */
1667
1668    coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4] */
1669    coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[4] */
1670
1671    if(0 == (ht & 1)) /* ht multiple of 2 case */
1672    {
1673
1674        if(0 == (wd & 7)) /* wd = multiple of 8 case */
1675        {
1676            for(row = 0; row < ht; row += 2)
1677            {
1678
1679                int offset = 0;
1680
1681                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
1682                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
1683
1684
1685
1686                for(col = 0; col < wd; col += 8)
1687                {
1688                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
1689                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
1690                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
1691
1692                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
1693                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
1694                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1695                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
1696                    /* row = 0 */
1697                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1698                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1699                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1700                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1701                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
1702
1703                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1704                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1705                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1706                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1707                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
1708
1709                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1710                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1711                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1712                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1713                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
1714
1715                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
1716                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
1717                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
1718
1719                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
1720                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
1721                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1722                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
1723                                                                                              /* row = 1 */
1724                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1725                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1726                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1727                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1728                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
1729
1730                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1731                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1732                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1733                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1734                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
1735
1736                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1737                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1738                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1739                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1740                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
1741
1742                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
1743                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
1744                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
1745
1746                    /* to store the 1st 4 pixels res. */
1747                    _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
1748                    _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
1749
1750                    offset += 8; /* To pointer updates*/
1751                }
1752                pu1_src += 2 * src_strd;  /* pointer updates*/
1753                pi2_dst += 2 * dst_strd;  /* pointer updates*/
1754            }
1755        }
1756        else /* wd = multiple of 4 case */
1757        {
1758            for(row = 0; row < ht; row += 2)
1759            {
1760                int offset = 0;
1761
1762                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
1763                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
1764
1765
1766                for(col = 0; col < wd; col += 4)
1767                {
1768                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
1769                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
1770                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
1771
1772                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
1773                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
1774                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1775                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
1776                    /* row = 0 */
1777                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1778                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1779                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1780                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1781                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
1782
1783                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1784                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1785                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1786                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1787                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
1788
1789                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1790                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1791                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1792                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1793                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
1794
1795                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
1796                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
1797                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
1798
1799                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
1800                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
1801                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1802                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
1803                                                                                              /* row = 1 */
1804                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1805                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1806                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1807                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1808                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
1809
1810                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1811                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1812                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1813                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1814                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
1815
1816                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1817                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1818                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1819                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1820                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
1821
1822                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
1823                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
1824                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
1825
1826                    /* to store the 1st 4 pixels res. */
1827                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
1828                    _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
1829
1830                    offset += 4; /* To pointer updates*/
1831                }
1832                pu1_src += 2 * src_strd;  /* Pointer update */
1833                pi2_dst += 2 * dst_strd;  /* Pointer update */
1834            }
1835        }
1836    }
1837    else /* odd ht */
1838    {
1839        if(0 == (wd & 7)) /* multiple of 8 case */
1840        {
1841            for(row = 0; row < ht; row++)
1842            {
1843                int offset = 0;
1844
1845                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
1846
1847
1848                for(col = 0; col < wd; col += 8)
1849                {
1850                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
1851                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
1852
1853                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
1854                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
1855                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1856                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
1857                    /* row = 0 */
1858                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1859                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1860                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1861                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1862                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
1863
1864                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1865                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1866                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1867                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1868                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
1869
1870                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1871                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1872                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1873                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1874                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
1875
1876                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
1877                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
1878                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
1879
1880                    /* to store the 8 pixels res. */
1881                    _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
1882
1883                    offset += 8; /* To pointer updates*/
1884                }
1885                pu1_src += src_strd;    /* pointer updates*/
1886                pi2_dst += dst_strd;    /* pointer updates*/
1887            }
1888        }
1889        else  /* wd = multiple of 4 case */
1890        {
1891            for(row = 0; row < (ht - 1); row += 2)
1892            {
1893                int offset = 0;
1894
1895
1896                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
1897                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
1898
1899
1900                for(col = 0; col < wd; col += 4)
1901                {
1902                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
1903                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
1904                    src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
1905
1906                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
1907                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
1908                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1909                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
1910                                                                                           /* row = 0 */
1911                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1912                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1913                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1914                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1915                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
1916
1917                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1918                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1919                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1920                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1921                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
1922
1923                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1924                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1925                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1926                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1927                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
1928
1929                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
1930                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
1931                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
1932
1933                    src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
1934                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
1935                    src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1936                    res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
1937                                                                                              /* row = 1 */
1938                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1939                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1940                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1941                    src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1942                    res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
1943
1944                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1945                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1946                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1947                    src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1948                    res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
1949
1950                    src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
1951                    src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
1952                    /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
1953                    src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
1954                    res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
1955
1956                    res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
1957                    res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
1958                    res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
1959
1960                    /* to store the 1st 4 pixels res. */
1961                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
1962                    _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
1963
1964                    offset += 4; /* To pointer updates*/
1965                }
1966                pu1_src += 2 * src_strd;  /* Pointer update */
1967                pi2_dst += 2 * dst_strd;  /* Pointer update */
1968            }
1969            { /* last repeat at outside the loop */
1970                int offset = 0;
1971                for(col = 0; col < wd; col += 4)
1972                {
1973                    /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
1974                    src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
1975
1976                    src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
1977                    /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
1978                    src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1979                    res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
1980                                                                                           /* row = 0 */
1981                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1982                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1983                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1984                    src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1985                    res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
1986
1987                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1988                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1989                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1990                    src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1991                    res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
1992
1993                    src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
1994                    src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
1995                    /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
1996                    src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
1997                    res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
1998
1999                    res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
2000                    res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
2001                    res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
2002
2003                    /* to store the 1st 4 pixels res. */
2004                    _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
2005
2006                    offset += 4; /* To pointer updates*/
2007                }
2008            }
2009        }
2010    }
2011}
2012
2013/**
2014*******************************************************************************
2015*
2016* @brief
2017*      Interprediction luma filter for vertical 16bit output
2018*
2019* @par Description:
2020*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
2021*    the elements pointed by 'pu1_src' and writes to the location pointed by
2022*    'pi2_dst'. No downshifting or clipping is done and the output is used as
2023*    an input for weighted prediction
2024*
2025* @param[in] pu1_src
2026*  UWORD8 pointer to the source
2027*
2028* @param[out] pi2_dst
2029*  WORD16 pointer to the destination
2030*
2031* @param[in] src_strd
2032*  integer source stride
2033*
2034* @param[in] dst_strd
2035*  integer destination stride
2036*
2037* @param[in] pi1_coeff
2038*  WORD8 pointer to the filter coefficients
2039*
2040* @param[in] ht
2041*  integer height of the array
2042*
2043* @param[in] wd
2044*  integer width of the array
2045*
2046* @returns
2047*
2048* @remarks
2049*  None
2050*
2051*******************************************************************************
2052*/
2053void ihevc_inter_pred_luma_vert_w16out_ssse3(UWORD8 *pu1_src,
2054                                             WORD16 *pi2_dst,
2055                                             WORD32 src_strd,
2056                                             WORD32 dst_strd,
2057                                             WORD8 *pi1_coeff,
2058                                             WORD32 ht,
2059                                             WORD32 wd)
2060{
2061    WORD32 row, col;
2062    UWORD8 *pu1_src_copy;
2063    WORD16 *pi2_dst_copy;
2064    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
2065    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b;
2066    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
2067    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
2068    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
2069    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b;
2070    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b;
2071    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b;
2072
2073
2074    __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
2075
2076/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
2077    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
2078
2079    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
2080    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
2081    control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
2082    control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
2083
2084    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
2085    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
2086
2087    coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4] */
2088    coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[4] */
2089
2090
2091/*  outer for loop starts from here */
2092    if((wd % 8) == 0)
2093    { /* wd = multiple of 8 case */
2094
2095        pu1_src_copy = pu1_src;
2096        pi2_dst_copy = pi2_dst;
2097
2098        for(col = 0; col < wd; col += 8)
2099        {
2100
2101            pu1_src = pu1_src_copy + col;
2102            pi2_dst = pi2_dst_copy + col;
2103
2104            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
2105            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
2106            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
2107            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
2108
2109            /*load 8 pixel values */
2110            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
2111
2112            /*load 8 pixel values */
2113            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
2114
2115            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
2116
2117            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
2118
2119            /*load 8 pixel values */
2120            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
2121
2122            /*load 8 pixel values */
2123            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
2124
2125            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
2126
2127            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
2128
2129            /*load 8 pixel values */
2130            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
2131
2132            /*load 8 pixel values */
2133            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
2134
2135            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
2136
2137            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
2138
2139            /*load 8 pixel values */
2140            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
2141
2142            /*load 8 pixel values */
2143            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
2144
2145            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
2146
2147            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
2148
2149            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
2150            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
2151            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
2152
2153            /* store 8 16-bit output values  */
2154            /* Store the output pixels of row 0*/
2155            _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
2156
2157            /* ROW 2*/
2158            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
2159            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
2160            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
2161
2162            /*load 8 pixel values */
2163            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
2164
2165            /*load 8 pixel values */
2166            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
2167
2168            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
2169
2170            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
2171
2172            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
2173            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
2174            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
2175
2176            /* store 8 16-bit output values  */
2177            /* Store the output pixels of row 2*/
2178            _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
2179
2180
2181            /*ROW 1*/
2182            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
2183
2184            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
2185
2186            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
2187
2188            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
2189
2190            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
2191
2192            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
2193
2194            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
2195
2196            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
2197
2198            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
2199            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
2200            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
2201
2202
2203            /* store 8 16-bit output values  */
2204            /* Store the output pixels of row 1*/
2205            _mm_store_si128((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
2206
2207
2208            /* ROW 3*/
2209            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
2210            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
2211            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
2212
2213            /*load 8 pixel values */
2214            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
2215
2216            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
2217
2218            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
2219
2220            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
2221            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
2222            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
2223
2224
2225            /* store 8 16-bit output values  */
2226            /* Store the output pixels of row 3*/
2227            _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
2228
2229            pu1_src += (8 * src_strd);
2230            pi2_dst += (4 * dst_strd);
2231
2232            for(row = 4; row < ht; row += 4)
2233            {
2234
2235                PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
2236                PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
2237                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
2238                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
2239
2240                s3_0_16x8b = s3_2_16x8b;
2241                s3_1_16x8b = s3_3_16x8b;
2242                s3_2_16x8b = s3_4_16x8b;
2243
2244                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
2245                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
2246                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
2247
2248                /*load 8 pixel values from (cur_row + 4)th row*/
2249                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
2250
2251                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
2252                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
2253
2254                s4_0_16x8b = s4_2_16x8b;
2255                s4_1_16x8b = s4_3_16x8b;
2256                s4_2_16x8b = s4_4_16x8b;
2257
2258                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
2259                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
2260                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
2261
2262                /* store 8 16-bit output values  */
2263                /* Store the output pixels of cur_row*/
2264                _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
2265
2266                /* row + 2*/
2267                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
2268                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
2269                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
2270
2271                /*load 8 pixel values from (cur_row + 5)th row*/
2272                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
2273
2274                /*load 8 pixel values from (cur_row + 6)th row*/
2275                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
2276
2277                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
2278                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
2279
2280                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
2281
2282                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
2283                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
2284                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
2285
2286                /* store 8 16-bit output values  */
2287                /* Store the output pixels of (cur_row+2)*/
2288                _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
2289
2290
2291                /*row + 1*/
2292                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
2293                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
2294                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
2295
2296                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
2297                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
2298                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
2299
2300                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
2301                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
2302                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
2303
2304
2305                /* store 8 16-bit output values  */
2306                /* Store the output pixels of (cur_row + 1)*/
2307                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
2308
2309
2310                /* row + 3*/
2311                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
2312                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
2313                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
2314
2315                /*load 8 pixel values from (cur_row + 7)th row*/
2316                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
2317
2318                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
2319                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
2320
2321                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
2322
2323                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
2324                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
2325                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
2326
2327                /* store 8 16-bit output values  */
2328                /* Store the output pixels of (cur_row+3)*/
2329                _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
2330
2331                s2_10_16x8b = s2_3_16x8b;
2332
2333
2334                pu1_src += 4 * src_strd; /* pointer update */
2335                pi2_dst += 4 * dst_strd; /* pointer update */
2336            }
2337        }
2338    }
2339    else /* wd = multiple of 4 case */
2340    {
2341
2342        pu1_src_copy = pu1_src;
2343        pi2_dst_copy = pi2_dst;
2344
2345        for(col = 0; col < wd; col += 4)
2346        {
2347
2348            pu1_src = pu1_src_copy + col;
2349            pi2_dst = pi2_dst_copy + col;
2350
2351            PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
2352            PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
2353            PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
2354            PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
2355
2356            /*load 8 pixel values */
2357            s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
2358
2359            /*load 8 pixel values */
2360            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
2361
2362            s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
2363
2364            s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
2365
2366            /*load 8 pixel values */
2367            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
2368
2369            /*load 8 pixel values */
2370            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
2371
2372            s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
2373
2374            s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
2375
2376            /*load 8 pixel values */
2377            s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
2378
2379            /*load 8 pixel values */
2380            s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
2381
2382            s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
2383
2384            s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
2385
2386            /*load 8 pixel values */
2387            s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
2388
2389            /*load 8 pixel values */
2390            s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
2391
2392            s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
2393
2394            s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
2395
2396            s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
2397            s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
2398            s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
2399
2400            /* store 8 8-bit output values  */
2401            /* Store the output pixels of row 0*/
2402            _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
2403
2404            /* ROW 2*/
2405            s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
2406            s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
2407            s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
2408
2409            /*load 8 pixel values */
2410            s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
2411
2412            /*load 8 pixel values */
2413            s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
2414
2415            s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
2416
2417            s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
2418
2419            s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
2420            s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
2421            s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
2422
2423            /* store 8 8-bit output values  */
2424            /* Store the output pixels of row 2*/
2425            _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
2426
2427
2428            /*ROW 1*/
2429            s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
2430
2431            s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
2432
2433            s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
2434
2435            s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
2436
2437            s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
2438
2439            s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
2440
2441            s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
2442
2443            s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
2444
2445            s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
2446            s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
2447            s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
2448
2449
2450            /* store 8 8-bit output values  */
2451            /* Store the output pixels of row 1*/
2452            _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
2453
2454
2455            /* ROW 3*/
2456            s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
2457            s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
2458            s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
2459
2460            /*load 8 pixel values */
2461            s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
2462
2463            s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
2464
2465            s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
2466
2467            s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
2468            s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
2469            s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
2470
2471            /* store 8 8-bit output values  */
2472            /* Store the output pixels of row 2*/
2473            _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
2474
2475            pu1_src += (8 * src_strd);
2476            pi2_dst += (4 * dst_strd);
2477
2478            for(row = 4; row < ht; row += 4)
2479            {
2480
2481                PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
2482                PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
2483                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
2484                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
2485
2486                s3_0_16x8b = s3_2_16x8b;
2487                s3_1_16x8b = s3_3_16x8b;
2488                s3_2_16x8b = s3_4_16x8b;
2489
2490                s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
2491                s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
2492                s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
2493
2494                /*load 8 pixel values from (cur_row + 4)th row*/
2495                s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
2496
2497                s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
2498                s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
2499
2500                s4_0_16x8b = s4_2_16x8b;
2501                s4_1_16x8b = s4_3_16x8b;
2502                s4_2_16x8b = s4_4_16x8b;
2503
2504                s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
2505                s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
2506                s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
2507
2508                /* store 8 8-bit output values  */
2509                /* Store the output pixels of row 4*/
2510                _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
2511
2512                /* row + 2*/
2513                s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
2514                s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
2515                s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
2516
2517                /*load 8 pixel values from (cur_row + 5)th row*/
2518                s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
2519
2520                /*load 8 pixel values from (cur_row + 6)th row*/
2521                s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
2522
2523                /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
2524                s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
2525
2526                s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
2527
2528                s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
2529                s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
2530                s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
2531
2532                /* store 8 8-bit output values  */
2533                /* Store the output pixels of (cur_row+2)*/
2534                _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
2535
2536
2537                /*row + 1*/
2538                s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
2539                s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
2540                s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
2541
2542                /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
2543                s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
2544                s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
2545
2546                s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
2547                s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
2548                s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
2549
2550                /* store 8 8-bit output values  */
2551                /* Store the output pixels of (cur_row + 1)*/
2552                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
2553
2554
2555                /* row + 3*/
2556                s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
2557                s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
2558                s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
2559
2560                /*load 8 pixel values from (cur_row + 7)th row*/
2561                s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
2562
2563                /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
2564                s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
2565
2566                s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
2567
2568                s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
2569                s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
2570                s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
2571
2572                /* store 8 8-bit output values  */
2573                /* Store the output pixels of (cur_row+3)*/
2574                _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
2575
2576                s2_10_16x8b = s2_3_16x8b;
2577
2578                pu1_src += 4 * src_strd; /* pointer update */
2579                pi2_dst += 4 * dst_strd; /* pointer update */
2580            }
2581        }
2582    }
2583}
2584
2585/**
2586*******************************************************************************
2587*
2588* @brief
2589*
2590*        Luma vertical filter for 16bit input.
2591*
2592* @par Description:
2593*   Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
*   the elements pointed by 'pi2_src' and writes to the location pointed by
*   'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12 and
*   clipped to lie between 0 and 255
2597*
2598* @param[in] pi2_src
2599*  WORD16 pointer to the source
2600*
2601* @param[out] pu1_dst
2602*  UWORD8 pointer to the destination
2603*
2604* @param[in] src_strd
2605*  integer source stride
2606*
2607* @param[in] dst_strd
2608*  integer destination stride
2609*
2610* @param[in] pi1_coeff
2611*  WORD8 pointer to the filter coefficients
2612*
2613* @param[in] ht
2614*  integer height of the array
2615*
2616* @param[in] wd
2617*  integer width of the array
2618*
2619* @returns
2620*
2621* @remarks
2622*  None
2623*
2624*******************************************************************************
2625*/
void ihevc_inter_pred_luma_vert_w16inp_ssse3(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD8 *pi1_coeff,
                                             WORD32 ht,
                                             WORD32 wd)
{
    /*
     * Vertical 8-tap luma filter: 16-bit intermediate input -> 8-bit output.
     * The block is processed in strips of 4 columns; within a strip four
     * output rows are produced per iteration.  Interleaved pairs of source
     * rows are cached in registers (s3_* feed the even output rows, s4_*
     * the odd output rows) so each source row is loaded from memory only
     * once.  NOTE(review): the loop structure assumes ht is a multiple of 4
     * (and >= 4) and wd a multiple of 4 -- confirm against callers.
     */
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    UWORD8 *pu1_dst_copy;
    /* coeffM_N_8x16b holds the tap pair {cM, cN} replicated in every 32-bit lane */
    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
    /* NOTE(review): despite the _16x8b suffix these registers hold 16-bit
     * samples in this function; the names were carried over from the
     * 8-bit-input variant above. */
    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;

    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;

/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);  /* 0xFF where tap < 0 */
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg); /* sign-extend taps to 16 bits */

    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* {c0, c1} in all lanes */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* {c2, c3} in all lanes */

    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* {c4, c5} in all lanes */
    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* {c6, c7} in all lanes */


/* setting values in register */
    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
    /* NOTE(review): the mask names read as swapped -- mask_low_32b CLEARS the
     * low 32 bits (keeps the upper 96), while mask_high_96b keeps only the
     * low 32 bits.  Their usage below is consistent, so behaviour is correct. */
    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);


    pi2_src_copy = pi2_src;
    pu1_dst_copy = pu1_dst;

/*  outer for loop starts from here -- one iteration per 4-column strip */
    for(col = 0; col < wd; col += 4)
    {

        pi2_src = pi2_src_copy + col;
        pu1_dst = pu1_dst_copy + col;

        /*load 4 16-bit pixel values of row -3 */
        s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));

        /*load 4 16-bit pixel values of row -2 */
        s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));

        /* interleave rows -3/-2 so each madd lane computes c0*a + c1*b */
        s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);

        s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);

        /*load 4 16-bit pixel values of row -1 */
        s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

        /*load 4 16-bit pixel values of row 0 */
        s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

        s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

        s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);

        /*load 4 16-bit pixel values of row 1 */
        s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

        /*load 4 16-bit pixel values of row 2 */
        s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

        s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);

        s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values of row 3 */
        s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

        /*load 4 16-bit pixel values of row 4 */
        s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));

        s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);

        s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

        /* sum the four 32-bit partial products for output row 0 */
        s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
        s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
        s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

        /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
        s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);

        /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);


        /* i2_tmp = CLIP_U8(i2_tmp);*/
        s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);

        /* merge the 4 new pixels (low 32 bits) with the 4 dst bytes that the
         * 8-byte store below would otherwise clobber */
        s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
        s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
        s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
        s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

        /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
        /* Store the output pixels of row 0*/
        _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);

        /* ROW 2 reuses the even-row interleaves shifted by one pair */
        s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
        s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
        s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values of row 5 */
        s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));

        /*load 4 16-bit pixel values of row 6 */
        s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));

        s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);

        s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

        s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
        s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
        s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

        /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
        s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);

        /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);


        /* i2_tmp = CLIP_U8(i2_tmp);*/
        s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);

        s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
        s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
        s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
        s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);

        /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
        /* Store the output pixels of row 2*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);


        /*ROW 1 uses odd-phase interleaves (rows -2/-1, 0/1, 2/3, 4/5) */
        s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

        s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);

        s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);

        s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);

        s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);

        s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

        s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);

        s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

        s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
        s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
        s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

        /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
        s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);

        /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);


        /* i2_tmp = CLIP_U8(i2_tmp);*/
        s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);

        s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (dst_strd)));
        s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
        s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
        s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);

        /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
        /* Store the output pixels of row 1*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);


        /* ROW 3*/
        s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
        s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
        s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values of row 7 */
        s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));

        s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);

        s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

        s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
        s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
        s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

        /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
        s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);

        /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
        s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);


        /* i2_tmp = CLIP_U8(i2_tmp);*/
        s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);

        s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
        s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
        s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
        s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);

        /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
        /* Store the output pixels of row 3*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);

        /* src advances 8 rows (rows -3..7 are already in registers); dst
         * advances past the 4 rows just written */
        pi2_src += (8 * src_strd);
        pu1_dst += (4 * dst_strd);

        /* steady-state loop: each iteration slides the register window down
         * 4 rows and loads only the 4 new source rows it needs */
        for(row = 4; row < ht; row += 4)
        {

            /* slide the even-phase interleaved row pairs down by 4 rows */
            s3_0_16x8b = s3_2_16x8b;
            s3_1_16x8b = s3_3_16x8b;
            s3_2_16x8b = s3_4_16x8b;

            s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
            s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
            s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 4)th row*/
            s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));

            s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
            s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

            /* slide the odd-phase interleaved row pairs down by 4 rows */
            s4_0_16x8b = s4_2_16x8b;
            s4_1_16x8b = s4_3_16x8b;
            s4_2_16x8b = s4_4_16x8b;

            s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
            s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
            s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
            s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);

            /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);


            /* i2_tmp = CLIP_U8(i2_tmp);*/
            s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);

            s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
            s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
            s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
            s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

            /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
            /* Store the output pixels of (cur_row + 0)*/
            _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);

/* row + 2*/
            s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
            s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
            s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 5)th row*/
            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));

            /*load 4 pixel values from (cur_row + 6)th row*/
            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

            /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
            s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

            s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

            s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
            s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
            s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
            s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);

            /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);


            /* i2_tmp = CLIP_U8(i2_tmp);*/
            s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);

            s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
            s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
            s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
            s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);

            /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
            /* Store the output pixels of (cur_row+2)*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);


/*row + 1*/
            s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
            s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
            s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

            /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
            s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
            s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

            s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
            s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
            s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
            s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);

            /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);

            /* i2_tmp = CLIP_U8(i2_tmp);*/
            s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);

            s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
            s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
            s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
            s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);

            /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
            /* Store the output pixels of (cur_row + 1)*/
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);


/* row + 3*/
            s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
            s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
            s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 7)th row*/
            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

            /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
            s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

            s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

            s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
            s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
            s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

            /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
            s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);

            /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
            s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);


            /* i2_tmp = CLIP_U8(i2_tmp);*/
            s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);

            s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
            s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
            s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
            s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);

            /* store 8 bytes: 4 new output pixels + 4 preserved dst bytes */
            /* Store the output pixels of (cur_row+3)*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);

            /* carry (cur_row + 7) forward: next iteration's "row 7" */
            s2_10_16x8b = s2_3_16x8b;

            pi2_src += 4 * src_strd; /* pointer update */
            pu1_dst += 4 * dst_strd; /* pointer update */
        }
    }

}
3060
3061
3062/**
3063*******************************************************************************
3064*
3065* @brief
3066*      Luma prediction filter for vertical 16bit input & output
3067*
3068* @par Description:
3069*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
*    the elements pointed by 'pi2_src' and writes to the location pointed by
*    'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
3072*    8192 is  subtracted to store it as a 16 bit number  The output is used as
3073*    a input to weighted prediction
3074*
3075* @param[in] pi2_src
3076*  WORD16 pointer to the source
3077*
3078* @param[out] pi2_dst
3079*  WORD16 pointer to the destination
3080*
3081* @param[in] src_strd
3082*  integer source stride
3083*
3084* @param[in] dst_strd
3085*  integer destination stride
3086*
3087* @param[in] pi1_coeff
3088*  WORD8 pointer to the filter coefficients
3089*
3090* @param[in] ht
3091*  integer height of the array
3092*
3093* @param[in] wd
3094*  integer width of the array
3095*
3096* @returns
3097*
3098* @remarks
3099*  None
3100*
3101*******************************************************************************
3102*/
void ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
                                                    WORD16 *pi2_dst,
                                                    WORD32 src_strd,
                                                    WORD32 dst_strd,
                                                    WORD8 *pi1_coeff,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    WORD16 *pi2_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;

    __m128i zero_8x16b, offset_8x16b, sign_reg;

/* load 8 8-bit coefficients and sign-extend them into 16-bit */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    /* each register carries one adjacent coefficient pair broadcast to all */
    /* 32-bit lanes, so _mm_madd_epi16 applies two filter taps per lane */
    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* (c0, c1) */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* (c2, c3) */

    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* (c4, c5) */
    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* (c6, c7) */


/* setting values in register */
    offset_8x16b = _mm_set1_epi32(OFFSET14); /* subtracted from the downshifted filter output */

    pi2_src_copy = pi2_src;
    pi2_dst_copy = pi2_dst;

/*  outer for loop: processes the block 4 columns at a time */
    for(col = 0; col < wd; col += 4)
    {

        pi2_src = pi2_src_copy + col;
        pi2_dst = pi2_dst_copy + col;

        /*load 4 pixel values of row -3*/
        s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));

        /*load 4 pixel values of row -2*/
        s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));

        /* interleave consecutive rows so madd can combine two taps per lane */
        s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);

        s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);

        /*load 4 pixel values of row -1*/
        s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

        /*load 4 pixel values of row 0*/
        s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

        s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

        s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);

        /*load 4 pixel values of row 1*/
        s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

        /*load 4 pixel values of row 2*/
        s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

        s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);

        s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

        /*load 4 pixel values of row 3*/
        s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

        /*load 4 pixel values of row 4*/
        s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));

        s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);

        s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

        /* accumulate the four partial 2-tap sums into the full 8-tap result */
        s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
        s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
        s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
        s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
        s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);

        s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 0*/
        _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);

        /* ROW 2: reuses the even-phase row pairs already unpacked for row 0 */
        s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
        s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
        s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

        /*load 4 pixel values of row 5*/
        s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));

        /*load 4 pixel values of row 6*/
        s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));

        s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);

        s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

        s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
        s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
        s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
        s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
        s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);

        s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 2*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);


        /*ROW 1: needs the odd-phase row pairs (row -2 with -1, 0 with 1, ...)*/
        s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

        s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);

        s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);

        s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);

        s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);

        s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

        s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);

        s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

        s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
        s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
        s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
        s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
        s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);

        s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 1*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s18_8x16b);


        /* ROW 3: reuses the odd-phase row pairs already unpacked for row 1 */
        s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
        s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
        s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

        /*load 4 pixel values of row 7*/
        s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));

        s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);

        s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

        s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
        s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
        s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

        /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
        s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


        /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
        s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);

        s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 3*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);

        /* src advances by 8 rows (rows -3..7 were consumed above) while dst */
        /* advances by 4 rows; the steady-state loop then steps both by 4 */
        pi2_src += (8 * src_strd);
        pi2_dst += (4 * dst_strd);

        /* steady-state: 4 output rows per iteration, recycling 6 of the 8 */
        /* previously unpacked row pairs and loading only 4 new source rows */
        for(row = 4; row < ht; row += 4)
        {

            s3_0_16x8b = s3_2_16x8b;
            s3_1_16x8b = s3_3_16x8b;
            s3_2_16x8b = s3_4_16x8b;

            s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
            s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
            s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 4)th row*/
            s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));

            s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
            s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

            s4_0_16x8b = s4_2_16x8b;
            s4_1_16x8b = s4_3_16x8b;
            s4_2_16x8b = s4_4_16x8b;

            s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
            s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
            s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
            s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
            s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);

            s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of cur_row*/
            _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);

/* row + 2*/
            s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
            s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
            s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 5)th row*/
            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));

            /*load 4 pixel values from (cur_row + 6)th row*/
            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

            /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
            s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

            s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

            s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
            s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
            s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
            s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
            s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);

            s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row+2)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);


/*row + 1*/
            s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
            s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
            s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

            /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
            s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
            s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

            s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
            s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
            s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
            s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
            s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);

            s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row + 1)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s18_8x16b);


/* row + 3*/
            s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
            s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
            s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 7)th row*/
            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

            /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
            s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

            s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

            s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
            s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
            s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

            /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
            s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) - OFFSET14 */
            s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);

            s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row+3)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);

            /* last loaded row becomes the oldest pending row of the next iteration */
            s2_10_16x8b = s2_3_16x8b;

            pi2_src += 4 * src_strd; /* pointer update */
            pi2_dst += 4 * dst_strd; /* pointer update */
        }
    }

}
3439
3440/**
3441*******************************************************************************
3442*
3443* @brief
3444*      Chroma interprediction filter for copy
3445*
3446* @par Description:
3447*    Copies the array of width 'wd' and height 'ht' from the  location pointed
3448*    by 'src' to the location pointed by 'dst'
3449*
3450* @param[in] pu1_src
3451*  UWORD8 pointer to the source
3452*
3453* @param[out] pu1_dst
3454*  UWORD8 pointer to the destination
3455*
3456* @param[in] src_strd
3457*  integer source stride
3458*
3459* @param[in] dst_strd
3460*  integer destination stride
3461*
* @param[in] pi1_coeff
*  WORD8 pointer to the filter coefficients (unused in the copy path)
3464*
3465* @param[in] ht
3466*  integer height of the array
3467*
3468* @param[in] wd
3469*  integer width of the array
3470*
3471* @returns
3472*
3473* @remarks
3474*  None
3475*
3476*******************************************************************************
3477*/
3478
3479void ihevc_inter_pred_chroma_copy_ssse3(UWORD8 *pu1_src,
3480                                        UWORD8 *pu1_dst,
3481                                        WORD32 src_strd,
3482                                        WORD32 dst_strd,
3483                                        WORD8 *pi1_coeff,
3484                                        WORD32 ht,
3485                                        WORD32 wd)
3486{
3487    WORD32 row, col;
3488    __m128i  s3, mask_4x32b;
3489    UNUSED(pi1_coeff);
3490    ASSERT(wd % 2 == 0); /* checking assumption*/
3491    ASSERT(ht % 2 == 0); /* checking assumption*/
3492
3493    mask_4x32b = _mm_set_epi32(0, 0, 0, 0x80808080); /* Mask register */
3494
3495/*  for loop starts from here */
3496    if(wd % 8 == 0)
3497    {
3498        for(row = 0; row < ht; row += 2)
3499        {
3500            int offset = 0;
3501            for(col = 0; col < 2 * wd; col += 16)
3502            {
3503/* row =0 */
3504
3505                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3506                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
3507                /* storing 16 8-bit output values */
3508                _mm_storeu_si128((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
3509
3510/* row =1 */
3511                /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
3512                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
3513                /* storing 8 8-bit output values */
3514                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]*/
3515
3516                offset += 16; /*To pointer update */
3517            } /*  inner for loop ends here(16-output values in single iteration) */
3518
3519            pu1_src += 2 * src_strd; /* pointer update */
3520            pu1_dst += 2 * dst_strd; /* pointer update */
3521        }
3522    }
3523    else if(wd % 4 == 0)
3524    {
3525        for(row = 0; row < ht; row += 2)
3526        {
3527            int offset = 0;
3528            for(col = 0; col < 2 * wd; col += 8)
3529            {
3530/* row =0  */
3531                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
3532                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
3533                /* storing 8 8-bit output values */
3534                _mm_storel_epi64((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
3535/* row =1 */
3536                /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
3537                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
3538                /* storing 8 8-bit output values */
3539                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
3540
3541                offset += 8; /* To pointer update */
3542            } /* inner for loop ends here(8-output values in single iteration) */
3543
3544            pu1_src += 2 * src_strd;  /* pointer update */
3545            pu1_dst += 2 * dst_strd;  /* pointer update */
3546        }
3547    }
3548    else
3549    {
3550        for(row = 0; row < ht; row += 2)
3551        {
3552            int offset = 0;
3553            for(col = 0; col < 2 * wd; col += 4)
3554            {
3555/* row =0 */
3556                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
3557                /* storing four 8-bit output values */
3558                _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + offset)); /* pu1_dst[col] = pu1_src[col]; */
3559/* row =1 */
3560                /* pu1_src[col] */
3561                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
3562
3563                /* storing four 8-bit output values */
3564                _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + dst_strd + offset)); /* pu1_dst[col] = pu1_src[col]; */
3565
3566                offset += 4; /* To pointer update */
3567            } /*  inner for loop ends here(4-output values in single iteration) */
3568
3569            pu1_src += 2 * src_strd; /* pointer increment */
3570            pu1_dst += 2 * dst_strd; /* pointer increment */
3571        }
3572    }
3573}
3574
3575/**
3576*******************************************************************************
3577*
3578* @brief
3579*     Chroma interprediction filter for horizontal input
3580*
3581* @par Description:
3582*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
3583*    to the elements pointed by 'pu1_src' and  writes to the location pointed
3584*    by 'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
3585*
3586* @param[in] pu1_src
3587*  UWORD8 pointer to the source
3588*
3589* @param[out] pu1_dst
3590*  UWORD8 pointer to the destination
3591*
3592* @param[in] src_strd
3593*  integer source stride
3594*
3595* @param[in] dst_strd
3596*  integer destination stride
3597*
3598* @param[in] pi1_coeff
3599*  WORD8 pointer to the filter coefficients
3600*
3601* @param[in] ht
3602*  integer height of the array
3603*
3604* @param[in] wd
3605*  integer width of the array
3606*
3607* @returns
3608*
3609* @remarks
3610*  None
3611*
3612*******************************************************************************
3613*/
void ihevc_inter_pred_chroma_horz_ssse3(UWORD8 *pu1_src,
                                        UWORD8 *pu1_dst,
                                        WORD32 src_strd,
                                        WORD32 dst_strd,
                                        WORD8 *pi1_coeff,
                                        WORD32 ht,
                                        WORD32 wd)
{
    WORD32 row, col;

    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b;

    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)

    ASSERT(wd % 2 == 0); /* checking assumption*/

/* load the four 8-bit filter coefficients */
    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset added before the downshift */
    /* despite the names: mask_high_96b ends up with ones in bytes 0..3   */
    /* (selects the new pixels) and mask_low_32b with ones in bytes 4..15 */
    /* (preserves existing destination bytes in the narrow-width path)    */
    mask_low_32b = _mm_cmpeq_epi16(offset_8x16b, offset_8x16b);
    mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
    mask_low_32b = _mm_slli_si128(mask_low_32b, 4);

    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */

    /* each register holds one adjacent coefficient pair replicated across */
    /* all lanes, so _mm_maddubs_epi16 applies two taps per 16-bit lane    */
    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* (c0, c1) */
    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* (c2, c3) */

/*  outer for loop starts from here */
    if(wd % 2 == 0 && wd % 4 != 0)
    {
        /* narrow path (wd == 2 mod 4): 4 interleaved bytes per row per step, */
        /* rows 0 and 1 are packed into one register and filtered together   */
        for(row = 0; row < ht; row += 2)
        {
            int offset = 0;

            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 4)
            {


                /*load 16 pixel values from row 0*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values from row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                /*Derive the source pixels for processing the 2nd pixel*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*Derive the source pixels for processing the 3rd pixel*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
                /*Derive the source pixels for processing the 4th pixel*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                /* combine row 0 (low half) and row 1 (high half) so one madd */
                /* pass filters both rows at once */
                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* rows 0 and 1 together */
                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* rows 0 and 1 together */
                res_temp13_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* bytes 0-3: row 0, bytes 4-7: row 1 */

                /* bring the row-1 pixels down into the low 4 bytes */
                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 4);

                /* read-modify-write: only 4 bytes are new, the upper 4 of the */
                /* 8-byte store must keep whatever is already in pu1_dst */
                res_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
                res_temp5_8x16b =  _mm_and_si128(res_temp4_8x16b, mask_low_32b);
                res_temp6_8x16b =  _mm_and_si128(res_temp13_8x16b, mask_high_96b);
                res_temp7_8x16b = _mm_or_si128(res_temp5_8x16b, res_temp6_8x16b);

                /* store 8 bytes: low 4 are the new row-0 pixels, high 4 unchanged */
                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp7_8x16b); /* pu1_dst[col] = i2_tmp_u  */

                res_temp14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
                res_temp15_8x16b =  _mm_and_si128(res_temp14_8x16b, mask_low_32b);
                res_temp16_8x16b =  _mm_and_si128(res_temp3_8x16b, mask_high_96b);
                res_temp17_8x16b = _mm_or_si128(res_temp15_8x16b, res_temp16_8x16b);

                /* store 8 bytes: low 4 are the new row-1 pixels, high 4 unchanged */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp17_8x16b); /* pu1_dst[col] = i2_tmp_u  */


                offset += 4; /* To pointer update*/

            } /* inner loop ends here(8- output values in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pu1_dst += 2 * dst_strd; /*pointer update*/
        }
    }
    else
    {
        /* wide path (wd multiple of 4): 8 interleaved bytes per row per step */
        for(row = 0; row < ht; row += 2)
        {
            int offset = 0;

            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 8)
            {

                /*load 16 pixel values from row 0*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values from row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);

                res_temp4_8x16b = _mm_adds_epi16(res_temp3_8x16b, offset_8x16b);             /* row = 0 */
                res_temp5_8x16b = _mm_srai_epi16(res_temp4_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
                res_temp6_8x16b = _mm_packus_epi16(res_temp5_8x16b, res_temp5_8x16b);        /* row = 0 */

                /* store 8 8-bit output pixels of row 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp6_8x16b); /* pu1_dst[col] */

                /*Derive the source pixels for processing the 2nd pixel of row 1*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 1*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 1*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* row = 1 */
                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
                res_temp16_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* row = 1 */

                /* store 8 8-bit output pixels of row 1 */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp16_8x16b); /* pu1_dst[col] */


                offset += 8; /* To pointer update*/

            } /* inner loop ends here(8- output values in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pu1_dst += 2 * dst_strd; /*pointer update*/
        }
    }
}
3821
3822/**
3823*******************************************************************************
3824*
3825* @brief
3826*     Chroma interprediction filter for vertical input
3827*
3828* @par Description:
*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
3830*    the elements pointed by 'pu1_src' and  writes to the location pointed by
3831*    'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
3832*
3833*
3834* @param[in] pu1_src
3835*  UWORD8 pointer to the source
3836*
3837* @param[out] pu1_dst
3838*  UWORD8 pointer to the destination
3839*
3840* @param[in] src_strd
3841*  integer source stride
3842*
3843* @param[in] dst_strd
3844*  integer destination stride
3845*
3846* @param[in] pi1_coeff
3847*  WORD8 pointer to the filter coefficients
3848*
3849* @param[in] ht
3850*  integer height of the array
3851*
3852* @param[in] wd
3853*  integer width of the array
3854*
3855* @returns
3856*
3857* @remarks
3858*  None
3859*
3860*******************************************************************************
3861*/
void ihevc_inter_pred_chroma_vert_ssse3(UWORD8 *pu1_src,
                                        UWORD8 *pu1_dst,
                                        WORD32 src_strd,
                                        WORD32 dst_strd,
                                        WORD8 *pi1_coeff,
                                        WORD32 ht,
                                        WORD32 wd)
{
    WORD32 row, col;
    UWORD8 *pu1_src_copy;
    UWORD8 *pu1_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    __m128i control_mask_1_8x16b, control_mask_2_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;

    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)

/* load the 4-tap chroma filter (8 bytes are read, only taps 0..3 are used) */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* shuffle mask: replicate coeff bytes {0,1} */
    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* shuffle mask: replicate coeff bytes {2,3} */

    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* {c0,c1} pair in all 8 lanes */
    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* {c2,c3} pair in all 8 lanes */


/*  setting constant values in registers */
    zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset applied before the downshift */
    /* NOTE(review): these two names are swapped relative to their values:
       mask_low_32b actually KEEPS the upper 96 bits (clears the low 32) and
       mask_high_96b KEEPS the low 32 bits. They are only used by the wd == 2
       path below to splice 4 freshly computed bytes into existing dst data. */
    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);

/*  outer for loop starts from here */
    if(wd % 8 == 0)
    { /* wd = multiple of 8 case: 16 interleaved chroma bytes (8 Cb/Cr pairs) per iteration */

        pu1_src_copy = pu1_src;
        pu1_dst_copy = pu1_dst;

        /* columns outer, rows inner: each 16-byte column strip is walked down the block */
        for(col = 0; col < 2 * wd; col += 16)
        {

            pu1_src = pu1_src_copy + col;
            pu1_dst = pu1_dst_copy + col;


            for(row = 0; row < ht; row += 2)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 16 pixels of row (cur - 1)*/
                s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 16 pixels of row (cur + 0)*/
                s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));


                /*load 16 pixels of row (cur + 1)*/
                s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 16 pixels of row (cur + 2)*/
                s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));

                /* interleave rows -1/0 and 1/2 so maddubs forms c0*a+c1*b per lane */
                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* c0*r[-1]+c1*r[0]+c2*r[1]+c3*r[2], low 8 lanes */

                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* same sum for the high 8 lanes */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
/* store 16 8-bit output values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storeu_si128((__m128i *)(pu1_dst), s7_8x16b);


                /* second output row: reuse rows 0..2, load only row (cur + 3) */
                s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* filtered sum for row (cur + 1), low 8 lanes */

                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* filtered sum for row (cur + 1), high 8 lanes */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
/* store 16 8-bit output values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), s7_8x16b);

                pu1_src += 2 * src_strd;
                pu1_dst += 2 * dst_strd;


            } /* inner for loop ends here (two rows of 16 outputs per iteration) */

        }
    }
    else if(wd % 4 == 0)
    { /* wd = multiple of 4 (but not of 8) case: 8 interleaved chroma bytes per iteration */

        for(row = 0; row < ht; row += 2)
        {
            pu1_src_copy = pu1_src;
            pu1_dst_copy = pu1_dst;
            for(col = 0; col < 2 * wd; col += 8)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 8 pixels of row (cur - 1)*/
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 8 pixels of row (cur + 0)*/
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                /*load 8 pixels of row (cur + 1)*/
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 8 pixels of row (cur + 2)*/
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* c0*r[-1]+c1*r[0]+c2*r[1]+c3*r[2] */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

/* store 8 8-bit output values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);

                /* second output row: reuse rows 0..2, load only row (cur + 3) */
                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* filtered sum for row (cur + 1) */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

/* store 8 8-bit output values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);

                pu1_src += 8;    /* To pointer update */
                pu1_dst += 8;

            } /* inner for loop ends here (two rows of 8 outputs per iteration) */

            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

    else
    { /* wd = 2 case: 4 interleaved chroma bytes per iteration, spliced into existing dst */

        for(row = 0; row < ht; row += 2)
        {
            pu1_src_copy = pu1_src;
            pu1_dst_copy = pu1_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 8 pixels of row (cur - 1); only the low 4 bytes are used*/
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 8 pixels of row (cur + 0)*/
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                /*load 8 pixels of row (cur + 1)*/
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 8 pixels of row (cur + 2)*/
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* c0*r[-1]+c1*r[0]+c2*r[1]+c3*r[2] */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

                /* read back 8 dst bytes and splice the 4 new bytes into the low 32 bits,
                   so the 8-byte store below rewrites bytes 4..7 with their original values */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 new + 4 preserved 8-bit values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);

                /* second output row: reuse rows 0..2, load only row (cur + 3) */
                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* filtered sum for row (cur + 1) */

                s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);

                /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);

                /* same read-modify-write splice for the second row */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 new + 4 preserved 8-bit values  */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);

                pu1_src += 4;   /* To pointer update */
                pu1_dst += 4;
            } /* inner for loop ends here (two rows of 4 outputs per iteration) */

            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }
}
4196
4197/**
4198*******************************************************************************
4199*
4200* @brief
4201*       chroma interprediction filter for copying 16bit output
4202*
4203* @par Description:
4204*    Copies the array of width 'wd' and height 'ht' from the  location pointed
4205*    by 'src' to the location pointed by 'dst' The output is upshifted by 6
4206*    bits and is used as input for vertical filtering or weighted prediction
4207*
4208* @param[in] pu1_src
4209*  UWORD8 pointer to the source
4210*
4211* @param[out] pi2_dst
4212*  WORD16 pointer to the destination
4213*
4214* @param[in] src_strd
4215*  integer source stride
4216*
4217* @param[in] dst_strd
4218*  integer destination stride
4219*
4220* @param[in] pi1_coeff
4221*  WORD8 pointer to the filter coefficients
4222*
4223* @param[in] ht
4224*  integer height of the array
4225*
4226* @param[in] wd
4227*  integer width of the array
4228*
4229* @returns
4230*
4231* @remarks
4232*  None
4233*
4234*******************************************************************************
4235*/
4236
4237void ihevc_inter_pred_chroma_copy_w16out_ssse3(UWORD8 *pu1_src,
4238                                               WORD16 *pi2_dst,
4239                                               WORD32 src_strd,
4240                                               WORD32 dst_strd,
4241                                               WORD8 *pi1_coeff,
4242                                               WORD32 ht,
4243                                               WORD32 wd)
4244{
4245    WORD32 row, col;
4246    __m128i  s3, zero_8x16b;
4247
4248    ASSERT(wd % 2 == 0); /* checking assumption*/
4249    ASSERT(ht % 2 == 0); /* checking assumption*/
4250
4251    UNUSED(pi1_coeff);
4252    zero_8x16b = _mm_setzero_si128();
4253/*  outer for loop starts from here */
4254    if(wd == 2) /* for wd =2 */
4255    {
4256        for(row = 0; row < ht; row += 2)
4257        {
4258            int offset = 0;
4259            for(col = 0; col < 2 * wd; col += 4)
4260            {
4261/* row =0 */
4262                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
4263                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
4264                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4265
4266                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4267
4268                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4269                _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
4270
4271/* row =1 */
4272                /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
4273                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
4274                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4275
4276                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4277
4278                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
4279                offset += 4; /* To pointer update */
4280            } /* inner for loop ends here */
4281
4282            pu1_src += 2 * src_strd; /* pointer update */
4283            pi2_dst += 2 * dst_strd; /* pointer update */
4284        }
4285    }
4286    else if(wd % 2 == 0 && wd % 4 != 0)
4287    {
4288        for(row = 0; row < ht / 2; row++)
4289        {
4290            int offset = 0;
4291            int count = (2 * wd) / 8;
4292            for(col = 0; col < count; col++)
4293            {
4294/* row =0 */
4295                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
4296                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
4297                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4298
4299                /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4300                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
4301
4302                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
4303                _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
4304
4305                /*row=1*/       /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
4306                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
4307                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4308
4309                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4310                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
4311
4312                offset += 8; /* To pointer update*/
4313            } /*  inner for loop ends here(8-output values in single iteration) */
4314
4315/* finding last four values */
4316            s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
4317            s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4318
4319            s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4320
4321            /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4322            _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
4323
4324            /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
4325            s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
4326            s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4327
4328            s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4329            _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
4330
4331            pu1_src += 2 * src_strd; /* pointer update */
4332            pi2_dst += 2 * dst_strd;
4333        }
4334    }
4335    else
4336    {
4337        for(row = 0; row < ht / 2; row++)
4338        {
4339            int offset = 0;
4340            for(col = 0; col < 2 * wd / 8; col++)
4341            {
4342/* row =0 */
4343                /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
4344                s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
4345                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4346
4347                /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4348                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
4349
4350                /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
4351                _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
4352
4353                /*row=1*/       /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
4354                s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
4355                s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
4356
4357                s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
4358                _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
4359
4360                offset += 8; /* To pointer update*/
4361            } /*  inner for loop ends here(8-output values in single iteration) */
4362
4363            pu1_src += 2 * src_strd; /* pointer update */
4364            pi2_dst += 2 * dst_strd;
4365        }
4366    }
4367}
4368
4369/**
4370*******************************************************************************
4371*
4372* @brief
*       chroma interprediction filter to store horizontal 16bit output
4374*
4375* @par Description:
4376*    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
4377*    to the elements pointed by 'pu1_src' and  writes to the location pointed
4378*    by 'pu1_dst'  No downshifting or clipping is done and the output is  used
4379*    as an input for vertical filtering or weighted  prediction
4380*
4381* @param[in] pu1_src
4382*  UWORD8 pointer to the source
4383*
4384* @param[out] pi2_dst
4385*  WORD16 pointer to the destination
4386*
4387* @param[in] src_strd
4388*  integer source stride
4389*
4390* @param[in] dst_strd
4391*  integer destination stride
4392*
4393* @param[in] pi1_coeff
4394*  WORD8 pointer to the filter coefficients
4395*
4396* @param[in] ht
4397*  integer height of the array
4398*
4399* @param[in] wd
4400*  integer width of the array
4401*
4402* @returns
4403*
4404* @remarks
4405*  None
4406*
4407*******************************************************************************
4408*/
4409void ihevc_inter_pred_chroma_horz_w16out_ssse3(UWORD8 *pu1_src,
4410                                               WORD16 *pi2_dst,
4411                                               WORD32 src_strd,
4412                                               WORD32 dst_strd,
4413                                               WORD8 *pi1_coeff,
4414                                               WORD32 ht,
4415                                               WORD32 wd)
4416{
4417    WORD32 row, col;
4418
4419    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, all_zero;
4420    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
4421    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
4422    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
4423    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b;
4424
4425    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
4426    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
4427    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
4428    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
4429    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
4430    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
4431
4432    ASSERT(wd % 2 == 0); /* checking assumption*/
4433
4434/* loading four 8-bit coefficients and convert 8-bit into 16-bit */
4435    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
4436
4437    all_zero = _mm_setzero_si128();
4438
4439    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
4440    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
4441
4442    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[4] */
4443    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[4] */
4444
4445/*  outer for loop starts from here */
4446    if(wd % 2 == 0 && wd % 4 != 0)
4447    {
4448        int offset = 0;
4449        for(row = ht; row >= 2; row -= 2)
4450        {
4451            offset = 0;
4452            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
4453            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
4454
4455
4456            for(col = 0; col < 2 * wd; col += 4)
4457            {
4458
4459                /*load 16 pixel values of row 0*/
4460                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
4461
4462                /*load 16 pixel values of row 1*/
4463                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
4464
4465                /*Derive the source pixels for processing the 2nd pixel of row 0*/
4466                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
4467
4468                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
4469
4470                /*Derive the source pixels for processing the 3rd pixel of row 0*/
4471                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
4472
4473                /*Derive the source pixels for processing the 4th pixel of row 0*/
4474                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
4475
4476                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
4477
4478                /*Derive the source pixels for processing the 2nd pixel of row 1*/
4479                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
4480
4481                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
4482
4483                /*Derive the source pixels for processing the 3rd pixel of row 1*/
4484                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
4485
4486                /*Derive the source pixels for processing the 4th pixel of row 1*/
4487                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
4488
4489                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
4490
4491                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
4492                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
4493                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
4494                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
4495
4496                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
4497                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
4498
4499                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 8);
4500
4501                /* store 4 16-bit values */
4502                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4503
4504
4505
4506                /* store 4 16-bit values */
4507                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4508
4509
4510                offset += 4; /* To pointer update*/
4511
4512            } /* inner loop ends here(8- output values in single iteration)*/
4513
4514            pu1_src += 2 * src_strd; /*pointer update*/
4515            pi2_dst += 2 * dst_strd; /*pointer update*/
4516        }
4517
4518        /*Epilogue to handle ht= odd case*/
4519        if(row)
4520        {
4521            offset = 0;
4522            for(col = 0; col < 2 * wd; col += 4)
4523            {
4524
4525                /*load 16 pixel values of row 0*/
4526                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
4527
4528                /*Derive the source pixels for processing the 2nd pixel of row 0*/
4529                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
4530
4531                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
4532
4533                /*Derive the source pixels for processing the 3rd pixel of row 0*/
4534                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
4535
4536                /*Derive the source pixels for processing the 4th pixel of row 0*/
4537                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
4538
4539                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
4540
4541                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, all_zero);
4542                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, all_zero);
4543                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
4544                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);
4545
4546                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
4547                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
4548
4549                //res_temp3_8x16b = _mm_srli_si128 (res_temp13_8x16b, 8);
4550
4551                /* store 4 16-bit values */
4552                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4553
4554                offset += 4; /* To pointer update*/
4555
4556            }
4557        }
4558
4559    }
4560    else
4561    {
4562        int offset = 0;
4563
4564        for(row = ht; row >= 2; row -= 2)
4565        {
4566            offset = 0;
4567            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
4568            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
4569
4570
4571            for(col = 0; col < 2 * wd; col += 8)
4572            {
4573
4574                /*load 16 pixel values of row 0*/
4575                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
4576
4577                /*load 16 pixel values of row 1*/
4578                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/
4579
4580                /*Derive the source pixels for processing the 2nd pixel of row 0*/
4581                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
4582
4583                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
4584
4585                /*Derive the source pixels for processing the 3rd pixel of row 0*/
4586                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
4587
4588                /*Derive the source pixels for processing the 4th pixel of row 0*/
4589                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
4590
4591                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
4592
4593                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
4594                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
4595
4596                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
4597                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
4598
4599                /* store 8 16-bit values */
4600                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4601
4602                /*Derive the source pixels for processing the 2nd pixel of row 1*/
4603                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);
4604
4605                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);
4606
4607                /*Derive the source pixels for processing the 3rd pixel of row 1*/
4608                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
4609
4610                /*Derive the source pixels for processing the 4th pixel of row 1*/
4611                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);
4612
4613                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);
4614
4615                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
4616                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);
4617
4618                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
4619                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
4620
4621                /* store 8 16-bit values */
4622                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4623
4624
4625                offset += 8; /* To pointer update*/
4626
4627            } /* inner loop ends here(8- output values in single iteration)*/
4628
4629            pu1_src += 2 * src_strd; /*pointer update*/
4630            pi2_dst += 2 * dst_strd; /*pointer update*/
4631        }
4632
4633        /*Epilogue to take care of odd ht*/
4634        if(row)
4635        {
4636            offset = 0;
4637            for(col = 0; col < 2 * wd; col += 8)
4638            {
4639
4640                /*load 16 pixel values of row 0*/
4641                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/
4642
4643                /*Derive the source pixels for processing the 2nd pixel of row 0*/
4644                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
4645
4646                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);
4647
4648                /*Derive the source pixels for processing the 3rd pixel of row 0*/
4649                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);
4650
4651                /*Derive the source pixels for processing the 4th pixel of row 0*/
4652                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);
4653
4654                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);
4655
4656                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
4657                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);
4658
4659                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
4660                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
4661
4662                /* store 8 16-bit values */
4663                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */
4664
4665                offset += 8; /* To pointer update*/
4666
4667            }
4668        }
4669
4670    }
4671}
4672
4673/**
4674*******************************************************************************
4675*
4676* @brief
*     Interprediction chroma filter to store vertical 16bit output
4678*
4679* @par Description:
4680*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
4681*    the elements pointed by 'pu1_src' and  writes to the location pointed by
4682*    'pu1_dst'  No downshifting or clipping is done and the output is  used as
4683*    an input for weighted prediction
4684*
4685* @param[in] pu1_src
4686*  UWORD8 pointer to the source
4687*
4688* @param[out] pi2_dst
4689*  WORD16 pointer to the destination
4690*
4691* @param[in] src_strd
4692*  integer source stride
4693*
4694* @param[in] dst_strd
4695*  integer destination stride
4696*
4697* @param[in] pi1_coeff
4698*  WORD8 pointer to the filter coefficients
4699*
4700* @param[in] ht
4701*  integer height of the array
4702*
4703* @param[in] wd
4704*  integer width of the array
4705*
4706* @returns
4707*
4708* @remarks
4709*  None
4710*
4711*******************************************************************************
4712*/
void ihevc_inter_pred_chroma_vert_w16out_ssse3(UWORD8 *pu1_src,
                                               WORD16 *pi2_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD8 *pi1_coeff,
                                               WORD32 ht,
                                               WORD32 wd)
{
    WORD32 row, col;
    UWORD8 *pu1_src_copy;
    WORD16 *pi2_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b;
    __m128i control_mask_1_8x16b, control_mask_2_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;


    /* Warm the cache with the first six source rows before filtering starts */
    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)

/* load the four 8-bit filter taps; the shuffles below replicate the tap
 * pairs (c0,c1) and (c2,c3) across all byte lanes so they can be fed to
 * _mm_maddubs_epi16 against byte-interleaved pairs of source rows */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* shuffle mask: picks coeff bytes 0,1 */
    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* shuffle mask: picks coeff bytes 2,3 */

    coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* {c0,c1} pair in every 16-bit lane */
    coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* {c2,c3} pair in every 16-bit lane */



/*  outer for loop starts from here */
/* NOTE(review): all three branches step the row loop by 2 with no odd-ht
 * epilogue -- presumably ht is always even for chroma vertical filtering;
 * confirm at call sites */
    if(wd % 8 == 0)
    { /* wd = multiple of 8 case: 16 output pixels per inner iteration */

        pu1_src_copy = pu1_src;
        pi2_dst_copy = pi2_dst;

        for(col = 0; col < 2 * wd; col += 16)
        {

            pu1_src = pu1_src_copy + col;
            pi2_dst = pi2_dst_copy + col;


            for(row = 0; row < ht; row += 2)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 16 pixel values of row -1 */
                s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 16 pixel values of row 0 */
                s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));


                /*load 16 pixel values of row 1 */
                s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 16 pixel values of row 2 */
                s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));

                /* interleave vertically adjacent rows so maddubs can apply a tap pair */
                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp = c0*r(-1) + c1*r(0) + c2*r(1) + c3*r(2) */

                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);

/* store 16 16-bit output values for row 0 (no shift/clip: w16out keeps full precision) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);

                _mm_storeu_si128((__m128i *)(pi2_dst + 8), s35_8x16b);


                /* row 1 output reuses rows 0..2 and needs only row 3 loaded */
                s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp for row 1 */

                s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* i2_tmp for row 1, upper 8 lanes */

/* store 16 16-bit output values for row 1 */
                /* NOTE(review): aligned stores here, while the row-0 stores above use
                 * storeu -- this assumes pi2_dst + dst_strd is 16-byte aligned (i.e.
                 * an aligned destination buffer with dst_strd a multiple of 8);
                 * confirm the intermediate-buffer guarantees at call sites */
                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);

                _mm_store_si128((__m128i *)(pi2_dst + dst_strd + 8), s35_8x16b);


                pu1_src += 2 * src_strd;
                pi2_dst += 2 * dst_strd;


            } /* inner for loop ends here(8-output values in single iteration) */

        }
    }

    else if(wd % 4 == 0)
    { /* wd = multiple of 4 (but not 8) case: 8 output pixels per inner iteration */

        for(row = 0; row < ht; row += 2)
        {

            pu1_src_copy = pu1_src;
            pi2_dst_copy = pi2_dst;

            for(col = 0; col < 2 * wd; col += 8)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 8 pixel values of row -1 */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 8 pixel values of row 0 */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                /*load 8 pixel values of row 1 */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 8 pixel values of row 2 */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp = 4-tap vertical sum, full 16-bit precision */

                /* store 8 16-bit output values for row 0 */
                _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);

                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp for row 1 */

                /* NOTE(review): aligned store -- assumes pi2_dst + dst_strd is 16-byte
                 * aligned (row-0 store above uses storeu); confirm at call sites */
                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);

                pu1_src += 8;    /* To pointer update */
                pi2_dst += 8;

            } /* inner for loop ends here(8-output values in single iteration) */

            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

    else
    { /* remainder case (wd = 2): 4 output pixels per inner iteration */

        for(row = 0; row < ht; row += 2)
        {
            pu1_src_copy = pu1_src;
            pi2_dst_copy = pi2_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
                PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


                /*load 8 pixel values of row -1 */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));

                /*load 8 pixel values of row 0 */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                /*load 8 pixel values of row 1 */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));

                /*load 8 pixel values of row 2 */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* i2_tmp = 4-tap vertical sum, full 16-bit precision */


/* store 4 16-bit output values for row 0 (only the low 64 bits are valid) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);

                s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* i2_tmp for row 1 */


/* store 4 16-bit output values for row 1 */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s8_8x16b);

                pu1_src += 4;   /* To pointer update */
                pi2_dst += 4;
            } /* inner for loop ends here(8-output values in single iteration) */

            pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }
}
4976
4977/**
4978*******************************************************************************
4979*
4980* @brief
4981*     chroma interprediction filter for vertical 16bit input
4982*
4983* @par Description:
4984*    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
4985*    the elements pointed by 'pu1_src' and  writes to the location pointed by
4986*    'pu1_dst'  Input is 16 bits  The filter output is downshifted by 12 and
4987*    clipped to lie  between 0 and 255
4988*
4989* @param[in] pi2_src
4990*  WORD16 pointer to the source
4991*
4992* @param[out] pu1_dst
4993*  UWORD8 pointer to the destination
4994*
4995* @param[in] src_strd
4996*  integer source stride
4997*
4998* @param[in] dst_strd
4999*  integer destination stride
5000*
5001* @param[in] pi1_coeff
5002*  WORD8 pointer to the filter coefficients
5003*
5004* @param[in] ht
5005*  integer height of the array
5006*
5007* @param[in] wd
5008*  integer width of the array
5009*
5010* @returns
5011*
5012* @remarks
5013*  None
5014*
5015*******************************************************************************
5016*/
void ihevc_inter_pred_chroma_vert_w16inp_ssse3(WORD16 *pi2_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD8 *pi1_coeff,
                                               WORD32 ht,
                                               WORD32 wd)
{
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    UWORD8 *pu1_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;


/* load 8 8-bit coefficients and convert 8-bit into 16-bit (sign-extended,
 * since the taps can be negative and are fed to _mm_madd_epi16) */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* {c0,c1} pair in every 32-bit lane */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* {c2,c3} pair in every 32-bit lane */

/*  seting  values in register */
    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset added before the final shift */
    /* NOTE(review): the mask names read inverted relative to their effect --
     * mask_low_32b ZEROES the low 32 bits (keeps bits 127:32) and
     * mask_high_96b KEEPS only the low 32 bits; they are used to merge 4 new
     * output pixels into 8 loaded destination bytes in the wd==2 path below */
    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);

/*  outer for loop starts from here */
/* NOTE(review): both branches step the row loop by 2 with no odd-ht
 * epilogue -- presumably ht is always even here; confirm at call sites */
    if(wd % 4 == 0)
    { /* wd = multiple of 4 case: 8 output pixels per inner iteration */

        pi2_src_copy = pi2_src;
        pu1_dst_copy = pu1_dst;

        for(col = 0; col < 2 * wd; col += 8)
        {

            pi2_src = pi2_src_copy + col;
            pu1_dst = pu1_dst_copy + col;


            for(row = 0; row < ht; row += 2)
            {

                /* load 8 16-bit input values of row -1 */
                /* NOTE(review): aligned loads -- assumes the 16-bit intermediate
                 * source buffer is 16-byte aligned with src_strd a multiple of 8;
                 * confirm at call sites */
                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 8 16-bit input values of row 0 */
                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));


                /* load 8 16-bit input values of row 1 */
                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 8 16-bit input values of row 2 */
                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));

                /* interleave vertically adjacent rows so madd can apply a tap pair */
                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* i4_tmp = 4-tap vertical sum at 32-bit precision */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b);

                /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);

                /* recombine the low and high halves into 8 contiguous bytes */
                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
/* store 8 8-bit output values for row 0 */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);


                /* row 1 output reuses rows 0..2 and needs only row 3 loaded */
                s25_8x16b = _mm_load_si128((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* i4_tmp for row 1 */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* i4_tmp for row 1, upper lanes */

                /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
/* store 8 8-bit output values for row 1 */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);

                pi2_src += 2 * src_strd;
                pu1_dst += 2 * dst_strd;


            } /* inner for loop ends here(8-output values in single iteration) */

        }
    }
    else
    { /* remainder case (wd = 2): 4 output pixels per inner iteration */

        for(row = 0; row < ht; row += 2)
        {
            pi2_src_copy = pi2_src;
            pu1_dst_copy = pu1_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                /* load 4 16-bit input values of row -1 */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 4 16-bit input values of row 0 */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                /* load 4 16-bit input values of row 1 */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 4 16-bit input values of row 2 */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* i4_tmp = 4-tap vertical sum at 32-bit precision */


                /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                /* read-modify-write merge: keep dst bytes 4..7 (mask_low_32b clears
                 * the low 32 bits) and insert the 4 new pixels into bytes 0..3, so
                 * the 8-byte store below logically updates only 4 pixels */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 8-bit output values for row 0 (merged with 4 preserved dst bytes) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);

                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* i4_tmp for row 1 */

                /* first downshift: i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* i2_tmp = CLIP_U8(i2_tmp);*/
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                /* same 4-pixel read-modify-write merge for the row-1 destination */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 8-bit output values for row 1 (merged with 4 preserved dst bytes) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);

                pi2_src += 4;   /* To pointer update */
                pu1_dst += 4;
            } /* inner for loop ends here(8-output values in single iteration) */

            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

}
5299
5300/**
5301*******************************************************************************
5302*
5303* @brief
5304*
5305*      Chroma interprediction filter for 16bit vertical input and output.
5306*
5307* @par Description:
5308*       Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
5309*       the elements pointed by 'pu1_src' and  writes to the location pointed by
5310*       'pu1_dst'  Input is 16 bits  The filter output is downshifted by 6 and
*       8192 is subtracted to store it as a 16 bit number. The output is used as
*       an input to weighted prediction
5313*
5314* @param[in] pi2_src
5315*  WORD16 pointer to the source
5316*
5317* @param[out] pi2_dst
5318*  WORD16 pointer to the destination
5319*
5320* @param[in] src_strd
5321*  integer source stride
5322*
5323* @param[in] dst_strd
5324*  integer destination stride
5325*
5326* @param[in] pi1_coeff
5327*  WORD8 pointer to the filter coefficients
5328*
5329* @param[in] ht
5330*  integer height of the array
5331*
5332* @param[in] wd
5333*  integer width of the array
5334*
5335* @returns
5336*
5337* @remarks
5338*  None
5339*
5340*******************************************************************************
5341*/
void ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
                                                      WORD16 *pi2_dst,
                                                      WORD32 src_strd,
                                                      WORD32 dst_strd,
                                                      WORD8 *pi1_coeff,
                                                      WORD32 ht,
                                                      WORD32 wd)
{
    /* 4-tap vertical chroma filter on 16-bit input, 16-bit output:
     * for each output sample, four vertically adjacent 16-bit samples
     * (rows -1, 0, 1, 2 relative to pi2_src) are multiply-accumulated with
     * the sign-extended 8-bit coefficients, then arithmetically shifted
     * right by SHIFT_14_MINUS_BIT_DEPTH and saturated to 16 bits. */
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    WORD16 *pi2_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i zero_8x16b, sign_reg;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;


/* load 8 coefficient bytes and sign-extend them to 16 bits; only the first
   four (the chroma 4-tap filter) are consumed below */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* {coeff0, coeff1} pair replicated in all four 32-bit lanes */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* {coeff2, coeff3} pair replicated in all four 32-bit lanes */


/*  outer for loop starts from here */
    if(wd % 4 == 0)
    { /* wd multiple of 4: columns outer, rows inner; 8 16-bit samples per iteration */

        pi2_src_copy = pi2_src;
        pi2_dst_copy = pi2_dst;

        /* 2 * wd: presumably interleaved Cb/Cr samples per row — TODO confirm against callers */
        for(col = 0; col < 2 * wd; col += 8)
        {

            pi2_src = pi2_src_copy + col;
            pi2_dst = pi2_dst_copy + col;


            /* two output rows are produced per iteration */
            for(row = 0; row < ht; row += 2)
            {

                /* load 8 16-bit samples from row -1 */
                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 8 16-bit samples from row 0 */
                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));


                /* load 8 16-bit samples from row 1 */
                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 8 16-bit samples from row 2 */
                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));

                /* interleave rows -1 and 0 so madd pairs each sample with its vertical neighbour */
                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);

                /* interleave rows 1 and 2 likewise */
                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);

                /* (row-1)*c0 + (row0)*c1, low half */
                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                /* (row-1)*c0 + (row0)*c1, high half */
                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                /* (row1)*c2 + (row2)*c3, low half */
                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                /* (row1)*c2 + (row2)*c3, high half */
                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* full 4-tap sum, low half */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* full 4-tap sum, high half */

                /* i2_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* saturate 32-bit results to 16 bits */
                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);

                /* recombine low and high 4-sample halves into one 8-sample register */
                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
/* store 8 16-bit output values for the first row */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_store_si128((__m128i *)(pi2_dst), s7_8x16b);


                /* second output row: taps span rows 0..3; row 3 loaded here.
                 * NOTE(review): unaligned load here vs aligned loads above —
                 * confirm the alignment guarantee for pi2_src + 3*src_strd */
                s25_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* full 4-tap sum, low half */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* full 4-tap sum, high half */

                /* i2_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
/* store 8 16-bit output values for the second row */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s7_8x16b);

                pi2_src += 2 * src_strd;
                pi2_dst += 2 * dst_strd;


            } /* inner for loop ends here(8-output values in single iteration) */

        }
    }
    else
    { /* wd not a multiple of 4: rows outer, columns inner; 4 16-bit samples per iteration */

        for(row = 0; row < ht; row += 2)
        {
            pi2_src_copy = pi2_src;
            pi2_dst_copy = pi2_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                /* load 4 16-bit samples from row -1 */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 4 16-bit samples from row 0 */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

                /* interleave rows -1 and 0 for the paired multiply-accumulate */
                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                /* (row-1)*c0 + (row0)*c1 */
                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                /* load 4 16-bit samples from row 1 */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 4 16-bit samples from row 2 */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                /* (row1)*c2 + (row2)*c3 */
                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* full 4-tap sum */

                /* i2_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* saturate 32-bit results to 16 bits */
                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

/* store 4 16-bit output values for the first row */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst), s9_8x16b);

                /* second output row: taps span rows 0..3 */
                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* full 4-tap sum */

                /* i2_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

/* store 4 16-bit output values for the second row */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s9_8x16b);

                pi2_src += 4;   /* To pointer update */
                pi2_dst += 4;
            } /* inner for loop ends here(4-output values in single iteration) */

            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

}
5551