1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21*  ihevc_weighted_pred_atom_intr.c
22*
23* @brief
24*  Contains function definitions for weighted prediction used in inter
25* prediction
26*
27* @author
28*
29*
30* @par List of Functions:
31*   - ihevc_weighted_pred_uni_ssse3()
32*   - ihevc_weighted_pred_bi_ssse3()
33*   - ihevc_weighted_pred_bi_default_ssse3()
34*   - ihevc_weighted_pred_chroma_uni_ssse3()
35*   - ihevc_weighted_pred_chroma_bi_ssse3()
36*   - ihevc_weighted_pred_chroma_bi_default_ssse3()
37*
38* @remarks
39*  None
40*
41*******************************************************************************
42*/
43/*****************************************************************************/
44/* File Includes                                                             */
45/*****************************************************************************/
46#include <stdio.h>
47#include <assert.h>
48
49#include "ihevc_debug.h"
50#include "ihevc_typedefs.h"
51#include "ihevc_macros.h"
52#include "ihevc_platform_macros.h"
53#include "ihevc_func_selector.h"
54#include "ihevc_defs.h"
55#include "ihevc_weighted_pred.h"
56#include "ihevc_inter_pred.h"
57
58
59#include <immintrin.h>
60
61/**
62*******************************************************************************
63*
64* @brief
65*  Does uni-weighted prediction on the array pointed by  pi2_src and stores
66* it at the location pointed by pi2_dst
67*
68* @par Description:
69*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) )  >> shift +
70* offset
71*
72* @param[in] pi2_src
73*  Pointer to the source
74*
75* @param[out] pu1_dst
76*  Pointer to the destination
77*
78* @param[in] src_strd
79*  Source stride
80*
81* @param[in] dst_strd
82*  Destination stride
83*
84* @param[in] wgt0
85*  weight to be multiplied to the source
86*
87* @param[in] off0
88*  offset to be added after rounding and
89*
90* @param[in] shifting
91*
92*
93* @param[in] shift
94*  (14 Bit depth) + log2_weight_denominator
95*
96* @param[in] lvl_shift
97*  added before shift and offset
98*
99* @param[in] ht
100*  height of the source
101*
102* @param[in] wd
103*  width of the source
104*
105* @returns
106*
107* @remarks
108*  None
109*
110*******************************************************************************
111*/
112
113void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
114                                   UWORD8 *pu1_dst,
115                                   WORD32 src_strd,
116                                   WORD32 dst_strd,
117                                   WORD32 wgt0,
118                                   WORD32 off0,
119                                   WORD32 shift,
120                                   WORD32 lvl_shift,
121                                   WORD32 ht,
122                                   WORD32 wd)
123{
124    WORD32 row, col, temp;
125
126    /* all 128 bit registers are named with a suffix mxnb, where m is the */
127    /* number of n bits packed in the register                            */
128    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
129    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
130    __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
131
132    ASSERT(wd % 4 == 0); /* checking assumption*/
133    ASSERT(ht % 4 == 0); /* checking assumption*/
134
135    temp = 1 << (shift - 1);
136
137    // seting values in register
138    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
139    wgt0_8x16b = _mm_set1_epi16(wgt0);
140
141    /* lvl_shift * wgt0 */
142    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
143    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
144
145    const_temp_4x32b = _mm_set1_epi32(temp);
146    off0_4x32b = _mm_set1_epi32(off0);
147
148
149    /* lvl_shift * wgt0 */
150    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
151    /* lvl_shift * wgt0 + 1 << (shift - 1) */
152    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
153
154    if(0 == (wd & 7)) /* wd multiple of 8 case */
155    {
156        __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
157
158        /*  outer for loop starts from here */
159        for(row = 0; row < ht; row += 4)
160        {
161            for(col = 0; col < wd; col += 8)
162            {   /* for row =0 ,1,2,3*/
163
164                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
165                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
166                /* row = 1 */
167                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
168                /* row = 2 */
169                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
170                /* row = 3 */
171                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
172
173                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
174                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
175                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
176                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
177                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
178
179                /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
180                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
181                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
182                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
183                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
184
185                /* Get 32 bit Result */
186                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
187                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
188                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
189                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
190
191                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
192                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
193                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
194                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
195
196                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
197                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
198                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
199                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
200                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
201                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
202                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
203                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
204                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
205
206                /* (i4_tmp >> shift) */ /* First 4 pixels */
207                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
208                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
209                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
210                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
211
212                /* (i4_tmp >> shift) */ /* Last 4 pixels */
213                res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
214                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
215                res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
216                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
217
218                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
219                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
220                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
221                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
222                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
223
224                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
225                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
226                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
227                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
228                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
229
230                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
231                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
232                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
233                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
234                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
235                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
236                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
237                res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
238                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
239
240                /* store four 8-bit output values  */
241                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
242                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 2*/
243                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 1*/
244                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
245
246                /* To update pointer */
247                pi2_src += 8;
248                pu1_dst += 8;
249
250            } /* inner loop ends here(4-output values in single iteration) */
251
252            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
253            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
254
255        }
256    }
257    else  /* wd multiple of 4 case */
258    {
259        WORD32 dst0, dst1, dst2, dst3;
260        /*  outer for loop starts from here */
261        for(row = 0; row < ht; row += 4)
262        {
263            for(col = 0; col < wd; col += 4)
264            {   /* for row =0 ,1,2,3*/
265
266                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
267                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
268                /* row = 1 */
269                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
270                /* row = 2 */
271                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
272                /* row = 3 */
273                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
274
275                /* 2 rows together */
276                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
277                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
278
279                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
280                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
281                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
282                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
283                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
284                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
285
286                /* Get 32 bit Result */
287                res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
288                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
289
290                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
291                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
292
293                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
294                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
295                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
296                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
297                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
298
299                /* (i4_tmp >> shift) */
300                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
301                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
302                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
303                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
304
305                /*i4_tmp = (i4_tmp >> shift) + off0; */
306                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
307                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
308                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
309                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
310
311                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
312                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
313
314                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
315                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
316
317                dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
318                /* dst row = 1 to 3 */
319                res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
320                res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
321                res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
322
323                /* store four 8-bit output values  */
324                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
325
326                dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
327                dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
328                dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
329
330                /* row = 1 to row = 3 */
331                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
332                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
333                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
334
335                /* To update pointer */
336                pi2_src += 4;
337                pu1_dst += 4;
338
339            } /* inner loop ends here(4-output values in single iteration) */
340
341            pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
342            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
343
344        }
345    }
346}
347
348/**
349*******************************************************************************
350*
351* @brief
352* Does chroma uni-weighted prediction on array pointed by pi2_src and stores
353* it at the location pointed by pi2_dst
354*
355* @par Description:
356*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) )  >> shift +
357* offset
358*
359* @param[in] pi2_src
360*  Pointer to the source
361*
362* @param[out] pu1_dst
363*  Pointer to the destination
364*
365* @param[in] src_strd
366*  Source stride
367*
368* @param[in] dst_strd
369*  Destination stride
370*
371* @param[in] wgt0
372*  weight to be multiplied to the source
373*
374* @param[in] off0
375*  offset to be added after rounding and
376*
377* @param[in] shifting
378*
379*
380* @param[in] shift
381*  (14 Bit depth) + log2_weight_denominator
382*
383* @param[in] lvl_shift
384*  added before shift and offset
385*
386* @param[in] ht
387*  height of the source
388*
389* @param[in] wd
390*  width of the source (each colour component)
391*
392* @returns
393*
394* @remarks
395*  None
396*
397*******************************************************************************
398*/
399
400
401void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
402                                          UWORD8 *pu1_dst,
403                                          WORD32 src_strd,
404                                          WORD32 dst_strd,
405                                          WORD32 wgt0_cb,
406                                          WORD32 wgt0_cr,
407                                          WORD32 off0_cb,
408                                          WORD32 off0_cr,
409                                          WORD32 shift,
410                                          WORD32 lvl_shift,
411                                          WORD32 ht,
412                                          WORD32 wd)
413{
414    WORD32 row, col, temp, wdx2;
415    /* all 128 bit registers are named with a suffix mxnb, where m is the */
416    /* number of n bits packed in the register                            */
417
418    __m128i src_temp0_8x16b, src_temp1_8x16b;
419    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
420    __m128i res_temp0_4x32b, res_temp1_4x32b;
421
422    ASSERT(wd % 2 == 0); /* checking assumption*/
423    ASSERT(ht % 2 == 0); /* checking assumption*/
424
425    temp = 1 << (shift - 1);
426    wdx2 = 2 * wd;
427
428    // seting values in register
429    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
430    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
431
432    /* lvl_shift * wgt0 */
433    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
434    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
435
436    const_temp_4x32b = _mm_set1_epi32(temp);
437    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
438
439    /* lvl_shift * wgt0 */
440    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
441    /* lvl_shift * wgt0 + 1 << (shift - 1) */
442    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
443
444    {
445        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
446        {
447            __m128i src_temp2_8x16b, src_temp3_8x16b;
448            __m128i res_temp2_4x32b, res_temp3_4x32b;
449            __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
450
451            /*  outer for loop starts from here */
452            for(row = 0; row < ht; row += 2)
453            {
454                for(col = 0; col < wdx2; col += 16)
455                {
456                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
457                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
458                    /* row = 1 */
459                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
460                    /* row = 0 */ /* Next 8 pixels */
461                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
462                    /* row = 1 */
463                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
464
465                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
466                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
467                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
468                    res_temp4_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
469                    res_temp5_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
470
471                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
472                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
473                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
474                    src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
475                    src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
476
477                    /* Get 32 bit Result */
478                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
479                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
480                    res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
481                    res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
482
483                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
484                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
485                    res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
486                    res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
487
488                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
489                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
490                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
491                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
492                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
493                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
494                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
495                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
496                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
497
498                    /* (i4_tmp >> shift) */
499                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
500                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
501                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
502                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
503                    /*i4_tmp = (i4_tmp >> shift) + off0; */
504                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
505                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
506                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
507                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
508                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
509
510                    /* (i4_tmp >> shift) */
511                    res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b,  shift);
512                    res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
513                    res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b,  shift);
514                    res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
515                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
516                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
517                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
518                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
519                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
520                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
521
522                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
523                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
524                    res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
525                    res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
526                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
527                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
528                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
529
530                    /* store 16 8-bit output values  */
531                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
532                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
533
534                    pi2_src += 16;  /* Pointer update */
535                    pu1_dst += 16; /* Pointer update */
536
537                } /* inner loop ends here(4-output values in single iteration) */
538                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
539                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
540            }
541        }
542        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
543        {
544            __m128i res_temp2_4x32b, res_temp3_4x32b;
545            /*  outer for loop starts from here */
546            for(row = 0; row < ht; row += 2)
547            {
548                for(col = 0; col < wdx2; col += 8)
549                {
550                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
551                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
552                    /* row = 1 */
553                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
554
555                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
556                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
557                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
558                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
559                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
560                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
561
562                    /* Get 32 bit Result */
563                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
564                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
565
566                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
567                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
568
569                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
570                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
571                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
572                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
573                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
574
575                    /* (i4_tmp >> shift) */
576                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
577                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
578                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
579                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
580
581                    /*i4_tmp = (i4_tmp >> shift) + off0; */
582                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
583                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
584                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
585                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
586                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
587
588                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
589                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
590
591                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
592                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
593                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
594
595                    /* store four 8-bit output values  */
596                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
597                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
598
599                    pi2_src += 8;   /* Pointer update */
600                    pu1_dst += 8; /* Pointer update */
601
602                } /* inner loop ends here(4-output values in single iteration) */
603                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
604                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
605            }
606        }
607        else /* 2*wd multiple of 4 case */
608        {
609            WORD32 dst0, dst1;
610            /*  outer for loop starts from here */
611            for(row = 0; row < ht; row += 2)
612            {
613                for(col = 0; col < wdx2; col += 4)
614                {
615                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
616                    src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
617                    /* row = 1 */
618                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
619
620                    /* 2 rows together */
621                    src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
622
623                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
624                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
625                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
626                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
627
628                    /* Get 32 bit Result */
629                    res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
630                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
631
632                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
633                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
634                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
635
636                    /* (i4_tmp >> shift) */
637                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
638                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
639
640                    /*i4_tmp = (i4_tmp >> shift) + off0; */
641                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
642                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
643
644                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
645
646                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
647                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
648
649                    dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
650                    /* dst row = 1 to 3 */
651                    res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
652
653                    /* store four 8-bit output values  */
654                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
655
656                    dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
657                    /* row = 1 */
658                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
659
660                    pi2_src += 4;   /* Pointer update */
661                    pu1_dst += 4; /* Pointer update */
662
663                } /* inner loop ends here(4-output values in single iteration) */
664                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
665                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
666            }
667        }
668    }
669}
670
671/**
672*******************************************************************************
673*
674* @brief
675*  Does bi-weighted prediction on the arrays pointed by  pi2_src1 and
676* pi2_src2 and stores it at location pointed  by pi2_dst
677*
678* @par Description:
679*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
680* off1 + 1) << (shift - 1) ) >> shift
681*
682* @param[in] pi2_src1
683*  Pointer to source 1
684*
685* @param[in] pi2_src2
686*  Pointer to source 2
687*
688* @param[out] pu1_dst
689*  Pointer to destination
690*
691* @param[in] src_strd1
692*  Source stride 1
693*
694* @param[in] src_strd2
695*  Source stride 2
696*
697* @param[in] dst_strd
698*  Destination stride
699*
700* @param[in] wgt0
701*  weight to be multiplied to source 1
702*
703* @param[in] off0
704*  offset 0
705*
706* @param[in] wgt1
707*  weight to be multiplied to source 2
708*
709* @param[in] off1
710*  offset 1
711*
712* @param[in] shift
713*  (14 Bit depth) + log2_weight_denominator
714*
715* @param[in] lvl_shift1
716*  added before shift and offset
717*
718* @param[in] lvl_shift2
719*  added before shift and offset
720*
721* @param[in] ht
722*  height of the source
723*
724* @param[in] wd
725*  width of the source
726*
727* @returns
728*
729* @remarks
730*  None
731*
732*******************************************************************************
733*/
734
735
736void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
737                                  WORD16 *pi2_src2,
738                                  UWORD8 *pu1_dst,
739                                  WORD32 src_strd1,
740                                  WORD32 src_strd2,
741                                  WORD32 dst_strd,
742                                  WORD32 wgt0,
743                                  WORD32 off0,
744                                  WORD32 wgt1,
745                                  WORD32 off1,
746                                  WORD32 shift,
747                                  WORD32 lvl_shift1,
748                                  WORD32 lvl_shift2,
749                                  WORD32 ht,
750                                  WORD32 wd)
751{
752    WORD32 row, col, temp;
753
754    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
755    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
756    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
757
758#include <assert.h>
759    ASSERT(wd % 4 == 0); /* checking assumption*/
760    ASSERT(ht % 4 == 0); /* checking assumption*/
761
762    temp = (off0 + off1 + 1) << (shift - 1);
763
764    // seting values in register
765    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
766    wgt0_8x16b = _mm_set1_epi16(wgt0);
767    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
768    wgt1_8x16b = _mm_set1_epi16(wgt1);
769
770    /* lvl_shift1 * wgt0 */
771    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
772    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
773    /* lvl_shift2 * wgt1 */
774    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
775    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
776
777    const_temp_4x32b = _mm_set1_epi32(temp);
778
779    /* lvl_shift1 * wgt0 */
780    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
781    /* lvl_shift2 * wgt1 */
782    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
783
784    if(0 == (wd & 7)) /* wd multiple of 8 case */
785    {
786        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
787        /*  outer for loop starts from here */
788        for(row = 0; row < ht; row += 2)
789        {
790            for(col = 0; col < wd; col += 8)
791            {
792                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
793                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
794                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
795                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
796                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
797
798                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
799                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
800                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
801                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
802                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
803                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
804                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
805                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
806                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
807                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
808
809                /* Get 32 bit Result */
810                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
811                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
812                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
813                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
814
815                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
816                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
817                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
818                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
819
820                /* (pi2_src[col] + lvl_shift) * wgt */
821                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
822                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
823                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
824                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
825                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
826                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
827                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
828                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
829
830                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
831                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
832                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
833                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
834                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
835                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
836                /* (i4_tmp >> shift) */
837                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
838                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
839
840                /* Next 4 Pixels */
841                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
842                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
843                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
844                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
845                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
846                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
847
848                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
849                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
850
851                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
852                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
853                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
854
855                /* store four 8-bit output values  */
856                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
857                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
858
859                pi2_src1 += 8;  /* Pointer update */
860                pi2_src2 += 8;  /* Pointer update */
861                pu1_dst  += 8;  /* Pointer update */
862
863            } /* inner loop ends here(4-output values in single iteration) */
864
865            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
866            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
867            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
868
869        } /* outer loop ends */
870    }
871    else /* wd multiple of 4 case */
872    {
873        WORD32 dst0, dst1;
874        /*  outer for loop starts from here */
875        for(row = 0; row < ht; row += 2)
876        {
877            for(col = 0; col < wd; col += 4)
878            {
879                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
880                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
881                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
882                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
883                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
884
885                /* 2 rows together */
886                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
887                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
888
889                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
890                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
891                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
892                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
893                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
894                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
895
896                /* Get 32 bit Result */
897                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
898                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
899
900                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
901                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
902
903                /* (pi2_src[col] + lvl_shift) * wgt */
904                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
905                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
906                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
907                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
908
909                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
910                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
911                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
912
913                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
914                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
915                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
916
917                /* (i4_tmp >> shift) */
918                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
919                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
920
921                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
922
923                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
924                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
925
926                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
927
928                /* dst row = 1 to 3 */
929                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
930
931                /* store four 8-bit output values  */
932                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
933
934                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
935
936                /* row = 1 */
937                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
938
939                pi2_src1 += 4;  /* Pointer update */
940                pi2_src2 += 4;  /* Pointer update */
941                pu1_dst  += 4;  /* Pointer update */
942
943            } /* inner loop ends here(4-output values in single iteration) */
944
945            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
946            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
947            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
948
949        } /* outer loop ends */
950    }
951
952}
953
954/**
955*******************************************************************************
956*
957* @brief
958* Does chroma bi-weighted prediction on the arrays pointed by  pi2_src1 and
959* pi2_src2 and stores it at location pointed  by pi2_dst
960*
961* @par Description:
962*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
963* off1 + 1) << (shift - 1) ) >> shift
964*
965* @param[in] pi2_src1
966*  Pointer to source 1
967*
968* @param[in] pi2_src2
969*  Pointer to source 2
970*
971* @param[out] pu1_dst
972*  Pointer to destination
973*
974* @param[in] src_strd1
975*  Source stride 1
976*
977* @param[in] src_strd2
978*  Source stride 2
979*
980* @param[in] dst_strd
981*  Destination stride
982*
983* @param[in] wgt0
984*  weight to be multiplied to source 1
985*
986* @param[in] off0
987*  offset 0
988*
989* @param[in] wgt1
990*  weight to be multiplied to source 2
991*
992* @param[in] off1
993*  offset 1
994*
995* @param[in] shift
996*  (14 Bit depth) + log2_weight_denominator
997*
998* @param[in] lvl_shift1
999*  added before shift and offset
1000*
1001* @param[in] lvl_shift2
1002*  added before shift and offset
1003*
1004* @param[in] ht
1005*  height of the source
1006*
1007* @param[in] wd
1008*  width of the source (each colour component)
1009*
1010* @returns
1011*
1012* @remarks
1013*  None
1014*
1015*******************************************************************************
1016*/
1017
1018
1019void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
1020                                         WORD16 *pi2_src2,
1021                                         UWORD8 *pu1_dst,
1022                                         WORD32 src_strd1,
1023                                         WORD32 src_strd2,
1024                                         WORD32 dst_strd,
1025                                         WORD32 wgt0_cb,
1026                                         WORD32 wgt0_cr,
1027                                         WORD32 off0_cb,
1028                                         WORD32 off0_cr,
1029                                         WORD32 wgt1_cb,
1030                                         WORD32 wgt1_cr,
1031                                         WORD32 off1_cb,
1032                                         WORD32 off1_cr,
1033                                         WORD32 shift,
1034                                         WORD32 lvl_shift1,
1035                                         WORD32 lvl_shift2,
1036                                         WORD32 ht,
1037                                         WORD32 wd)
1038{
1039    WORD32 row, col, temp1, temp2;
1040    WORD32 wdx2;
1041
1042    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1043    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
1044    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
1045
1046    ASSERT(wd % 2 == 0); /* checking assumption*/
1047    ASSERT(ht % 2 == 0); /* checking assumption*/
1048
1049    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
1050    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
1051
1052    // seting values in register
1053    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
1054    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
1055    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
1056    wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
1057
1058    /* lvl_shift1 * wgt0 */
1059    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
1060    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
1061    /* lvl_shift2 * wgt1 */
1062    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
1063    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
1064
1065    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
1066    wdx2 = wd * 2;
1067
1068    /* lvl_shift1 * wgt0 */
1069    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
1070    /* lvl_shift2 * wgt1 */
1071    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
1072
1073    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
1074    {
1075        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
1076        /*  outer for loop starts from here */
1077        for(row = 0; row < ht; row += 2)
1078        {
1079            for(col = 0; col < wdx2; col += 8)
1080            {
1081                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1082                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1083                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1084                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1085                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1086
1087                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
1088                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1089                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
1090                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
1091                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
1092                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
1093                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1094                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
1095                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
1096                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
1097
1098                /* Get 32 bit Result */
1099                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1100                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1101                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
1102                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
1103
1104                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1105                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1106                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
1107                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
1108
1109                /* (pi2_src[col] + lvl_shift) * wgt */
1110                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
1111                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
1112                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
1113                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
1114                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
1115                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
1116                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
1117                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
1118
1119                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1120                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
1121                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
1122                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1123                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
1124                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
1125                /* (i4_tmp >> shift) */
1126                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
1127                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
1128
1129                /* Next 4 Pixels */
1130                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
1131                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
1132                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
1133                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
1134                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
1135                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
1136
1137                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
1138                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
1139
1140                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1141                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
1142                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
1143
1144                /* store four 8-bit output values  */
1145                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
1146                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
1147
1148                pi2_src1 += 8;  /* Pointer update */
1149                pi2_src2 += 8;  /* Pointer update */
1150                pu1_dst  += 8;  /* Pointer update */
1151
1152            } /* inner loop ends here(4-output values in single iteration) */
1153
1154            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
1155            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
1156            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
1157
1158        } /* outer loop ends */
1159    }
1160    else /* wdx2 multiple of 4 case */
1161    {
1162        WORD32 dst0, dst1;
1163        /*  outer for loop starts from here */
1164        for(row = 0; row < ht; row += 2)
1165        {
1166            for(col = 0; col < wdx2; col += 4)
1167            {
1168                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1169                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
1170                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
1171                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1172                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1173
1174                /* 2 rows together */
1175                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1176                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1177
1178                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
1179                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1180                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
1181                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
1182                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1183                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
1184
1185                /* Get 32 bit Result */
1186                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1187                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1188
1189                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1190                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1191
1192                /* (pi2_src[col] + lvl_shift) * wgt */
1193                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
1194                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
1195                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
1196                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
1197
1198                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1199                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
1200                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
1201
1202                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1203                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
1204                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
1205
1206                /* (i4_tmp >> shift) */
1207                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
1208                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
1209
1210                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
1211
1212                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1213                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
1214
1215                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
1216
1217                /* dst row = 1 to 3 */
1218                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
1219
1220                /* store four 8-bit output values  */
1221                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1222
1223                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
1224
1225                /* row = 1 */
1226                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1227
1228                pi2_src1 += 4;  /* Pointer update */
1229                pi2_src2 += 4;  /* Pointer update */
1230                pu1_dst  += 4;  /* Pointer update */
1231
1232            } /* inner loop ends here(4-output values in single iteration) */
1233
1234            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
1235            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
1236            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
1237        }
1238    }
1239
1240}
1241
1242/**
1243*******************************************************************************
1244*
1245* @brief
1246*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
1247* pi2_src2 and stores it at location  pointed by pi2_dst
1248*
1249* @par Description:
1250*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  1 << (shift - 1) )
1251* >> shift  where shift = 15 - BitDepth
1252*
1253* @param[in] pi2_src1
1254*  Pointer to source 1
1255*
1256* @param[in] pi2_src2
1257*  Pointer to source 2
1258*
1259* @param[out] pu1_dst
1260*  Pointer to destination
1261*
1262* @param[in] src_strd1
1263*  Source stride 1
1264*
1265* @param[in] src_strd2
1266*  Source stride 2
1267*
1268* @param[in] dst_strd
1269*  Destination stride
1270*
1271* @param[in] lvl_shift1
1272*  added before shift and offset
1273*
1274* @param[in] lvl_shift2
1275*  added before shift and offset
1276*
1277* @param[in] ht
1278*  height of the source
1279*
1280* @param[in] wd
1281*  width of the source
1282*
1283* @returns
1284*
1285* @remarks
1286*  None
1287*
1288* Assumption : ht%4 == 0, wd%4 == 0
1289* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
1290* final result will match even if intermediate precision is in 16 bit.
1291*
1292*******************************************************************************
1293*/
1294void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
1295                                          WORD16 *pi2_src2,
1296                                          UWORD8 *pu1_dst,
1297                                          WORD32 src_strd1,
1298                                          WORD32 src_strd2,
1299                                          WORD32 dst_strd,
1300                                          WORD32 lvl_shift1,
1301                                          WORD32 lvl_shift2,
1302                                          WORD32 ht,
1303                                          WORD32 wd)
1304{
1305    {
1306        WORD32 row, col, temp;
1307        WORD32 shift;
1308
1309        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1310        __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
1311        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1312
1313        ASSERT(wd % 4 == 0); /* checking assumption*/
1314        ASSERT(ht % 2 == 0); /* checking assumption*/
1315
1316        shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
1317        temp = 1 << (shift - 1);
1318
1319        // seting values in register
1320        lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
1321        lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
1322        const_temp_8x16b = _mm_set1_epi16(temp);
1323
1324        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
1325        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
1326
1327        if(0 == (ht & 3)) /* ht multiple of 4*/
1328        {
1329            if(0 == (wd & 15)) /* wd multiple of 16 case */
1330            {
1331                __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1332                __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1333                /*  outer for loop starts from here */
1334                for(row = 0; row < ht; row += 4)
1335                {
1336                    for(col = 0; col < wd; col += 16)
1337                    {
1338                        /*load 8 pixel values */ /* First 8 Values */
1339                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1340                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1341                        /* row = 1 */
1342                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1343                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1344                        /* row = 2 */
1345                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1346                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1347                        /* row = 3 */
1348                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1349                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1350
1351                        /*load 8 pixel values */ /* Second 8 Values */
1352                        src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1353                        src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1354                        /* row = 1 */
1355                        src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1356                        src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1357                        /* row = 2 */
1358                        src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1359                        src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1360
1361                        /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1362                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1363                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1364                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1365                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1366
1367                        /*load 8 pixel values */ /* Second 8 Values */
1368                        /* row = 3 */
1369                        src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1370                        src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1371
1372                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1373                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1374                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1375                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1376                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1377
1378                        /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1379                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
1380                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1381                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1382                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1383
1384                        /* (i4_tmp >> shift) */ /* First 8 Values */
1385                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1386                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1387                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1388                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1389
1390                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1391                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1392                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1393                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1394                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1395
1396                        /* (i4_tmp >> shift) */ /* Second 8 Values */
1397                        src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
1398                        src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
1399                        src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
1400                        src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
1401
1402                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */
1403                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
1404                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
1405                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
1406                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
1407
1408                        /* store four 8-bit output values  */ /* 16 8 Values */
1409                        _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1410                        _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1411                        _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1412                        _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1413
1414                        /* To update pointer */
1415                        pi2_src1 += 16;
1416                        pi2_src2 += 16;
1417                        pu1_dst  += 16;
1418
1419                    } /* inner loop ends here(8-output values in single iteration) */
1420
1421                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
1422                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
1423                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
1424
1425                }
1426            }
1427            else if(0 == (wd & 7)) /* multiple of 8 case */
1428            {
1429                /*  outer for loop starts from here */
1430                for(row = 0; row < ht; row += 4)
1431                {
1432                    for(col = 0; col < wd; col += 8)
1433                    {
1434                        /*load 8 pixel values */
1435                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1436                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1437                        /* row = 1 */
1438                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1439                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1440                        /* row = 2 */
1441                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1442                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1443                        /* row = 3 */
1444                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1445                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1446
1447                        /* (pi2_src1[col] + pi2_src2[col]) */
1448                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1449                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1450                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1451                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1452
1453                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1454                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1455                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1456                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1457                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1458
1459                        /* (i4_tmp >> shift) */
1460                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1461                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1462                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1463                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1464
1465                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1466                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1467                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1468                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1469                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1470
1471                        /* store four 8-bit output values  */
1472                        _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1473                        _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1474                        _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1475                        _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1476
1477                        /* To update pointer */
1478                        pi2_src1 += 8;
1479                        pi2_src2 += 8;
1480                        pu1_dst  += 8;
1481
1482                    } /* inner loop ends here(8-output values in single iteration) */
1483
1484                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
1485                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
1486                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
1487
1488                }
1489            }
1490            else /* wd multiple of 4 case*/
1491            {
1492                WORD32 dst0, dst1, dst2, dst3;
1493
1494                /*  outer for loop starts from here */
1495                for(row = 0; row < ht; row += 4)
1496                {
1497                    for(col = 0; col < wd; col += 4)
1498                    {
1499                        /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1500                        src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1501                        /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1502                        src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1503
1504                        /* row = 1 */
1505                        src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1506                        src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1507                        /* row = 2 */
1508                        src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
1509                        src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
1510                        /* row = 3 */
1511                        src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
1512                        src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
1513
1514                        /* Pack two rows together */
1515                        src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1516                        src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1517                        src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
1518                        src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
1519
1520                        /* (pi2_src1[col] + pi2_src2[col]) */
1521                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1522                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1523
1524                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1525                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1526                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1527
1528                        /* (i4_tmp >> shift) */
1529                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1530                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1531
1532                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1533                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1534                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1535
1536                        dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1537                        /* dst row = 1 to 3 */
1538                        src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1539                        src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
1540
1541                        /* store four 8-bit output values  */
1542                        *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1543
1544                        dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1545                        dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
1546                        dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
1547
1548                        /* row = 1 to row = 3 */
1549                        *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1550                        *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
1551                        *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
1552
1553                        /* To update pointer */
1554                        pi2_src1 += 4;
1555                        pi2_src2 += 4;
1556                        pu1_dst  += 4;
1557
1558                    } /* inner loop ends here(4-output values in single iteration) */
1559
1560                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1561                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1562                    pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */
1563
1564                }
1565            }
1566        }
1567        else /* ht multiple of 2 case and wd multiple of 4 case*/
1568        {
1569
1570            WORD32 dst0, dst1;
1571
1572            /*  outer for loop starts from here */
1573            for(row = 0; row < ht; row += 2)
1574            {
1575                for(col = 0; col < wd; col += 4)
1576                {
1577                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1578                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1579                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1580                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1581
1582                    /* row = 1 */
1583                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1584                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1585
1586                    /* Pack two rows together */
1587                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1588                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1589
1590                    /* (pi2_src1[col] + pi2_src2[col]) */
1591                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1592
1593                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1594                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1595
1596                    /* (i4_tmp >> shift) */
1597                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1598
1599                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1600                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1601
1602                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1603                    /* dst row = 1 to 3 */
1604                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1605
1606                    /* store four 8-bit output values  */
1607                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1608
1609                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1610
1611                    /* row = 1 to row = 3 */
1612                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1613
1614                    /* To update pointer */
1615                    pi2_src1 += 4;
1616                    pi2_src2 += 4;
1617                    pu1_dst  += 4;
1618
1619                } /* inner loop ends here(4-output values in single iteration) */
1620
1621                pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
1622                pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
1623                pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */
1624
1625            }
1626
1627        }
1628
1629    }
1630}
1631
1632
1633/**
1634*******************************************************************************
1635*
1636* @brief
1637*  Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and
1638* pi2_src2 and stores it at location  pointed by pi2_dst
1639*
1640* @par Description:
1641*  dst = ( (src1 + lvl_shift1) +  (src2 + lvl_shift2) +  1 << (shift - 1) )
1642* >> shift  where shift = 15 - BitDepth
1643*
1644* @param[in] pi2_src1
1645*  Pointer to source 1
1646*
1647* @param[in] pi2_src2
1648*  Pointer to source 2
1649*
1650* @param[out] pu1_dst
1651*  Pointer to destination
1652*
1653* @param[in] src_strd1
1654*  Source stride 1
1655*
1656* @param[in] src_strd2
1657*  Source stride 2
1658*
1659* @param[in] dst_strd
1660*  Destination stride
1661*
1662* @param[in] lvl_shift1
1663*  added before shift and offset
1664*
1665* @param[in] lvl_shift2
1666*  added before shift and offset
1667*
1668* @param[in] ht
1669*  height of the source
1670*
1671* @param[in] wd
1672*  width of the source (each colour component)
1673*
1674* @returns
1675*
1676* @remarks
1677*  None
1678*
1679* Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1==0, lvl_shift2==0.
1680* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
1681* final result will match even if intermediate precision is in 16 bit.
1682*******************************************************************************
1683*/
1684
1685void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
1686                                                 WORD16 *pi2_src2,
1687                                                 UWORD8 *pu1_dst,
1688                                                 WORD32 src_strd1,
1689                                                 WORD32 src_strd2,
1690                                                 WORD32 dst_strd,
1691                                                 WORD32 lvl_shift1,
1692                                                 WORD32 lvl_shift2,
1693                                                 WORD32 ht,
1694                                                 WORD32 wd)
1695{
1696    WORD32 row, col, temp;
1697    WORD32 shift, wdx2;
1698
1699    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1700    __m128i lvl_shift1_8x16b;
1701    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1702
1703    ASSERT(wd % 2 == 0); /* checking assumption*/
1704    ASSERT(ht % 2 == 0); /* checking assumption*/
1705    UNUSED(lvl_shift1);
1706    UNUSED(lvl_shift2);
1707    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
1708    temp = 1 << (shift - 1);
1709    wdx2 = wd * 2;
1710
1711    // seting values in register
1712    lvl_shift1_8x16b = _mm_set1_epi16(temp);
1713
1714    if(0 == (ht & 3)) /* ht multiple of 4 case */
1715    {
1716        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
1717        {
1718            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1719            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1720            /*  outer for loop starts from here */
1721            for(row = 0; row < ht; row += 4)
1722            {
1723                for(col = 0; col < wdx2; col += 16)
1724                {
1725                    /*load 8 pixel values */ /* First 8 Values */
1726                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1727                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1728                    /* row = 1 */
1729                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1730                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1731                    /* row = 2 */
1732                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1733                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1734                    /* row = 3 */
1735                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1736                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1737
1738                    /*load 8 pixel values */ /* Second 8 Values */
1739                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1740                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1741                    /* row = 1 */
1742                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1743                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1744                    /* row = 2 */
1745                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1746                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1747
1748                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1749                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1750                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1751                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1752                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1753
1754                    /*load 8 pixel values */ /* Second 8 Values */
1755                    /* row = 3 */
1756                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1757                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1758
1759                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1760                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1761                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1762                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1763                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1764
1765                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1766                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
1767                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1768                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1769                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1770
1771                    /* (i4_tmp >> shift) */ /* First 8 Values */
1772                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1773                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1774                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1775                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1776
1777                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1778                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1779                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1780                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1781                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1782
1783                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
1784                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1785                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1786                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1787                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1788
1789                    /* (i4_tmp >> shift) */ /* Second 8 Values */
1790                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
1791                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
1792                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
1793                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
1794
1795                    /* store four 8-bit output values  */ /* First 8 Values */
1796                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1797                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1798                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1799                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1800
1801                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
1802                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
1803                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
1804                    src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
1805                    src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
1806
1807                    /* store four 8-bit output values  */ /* Second 8 Values */
1808                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
1809                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/
1810                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 1*/
1811                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
1812
1813                    /* To update pointer */
1814                    pi2_src1 += 16;
1815                    pi2_src2 += 16;
1816                    pu1_dst  += 16;
1817
1818                } /* inner loop ends here(8-output values in single iteration) */
1819
1820                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
1821                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
1822                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
1823
1824            }
1825        }
1826        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
1827        {
1828            /*  outer for loop starts from here */
1829            for(row = 0; row < ht; row += 4)
1830            {
1831                for(col = 0; col < wdx2; col += 8)
1832                {
1833                    /*load 8 pixel values */
1834                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1835                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1836                    /* row = 1 */
1837                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1838                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1839                    /* row = 2 */
1840                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1841                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1842                    /* row = 3 */
1843                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1844                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1845
1846                    /* (pi2_src1[col] + pi2_src2[col]) */
1847                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1848                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1849                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1850                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1851
1852                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1853                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1854                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1855                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1856                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1857
1858                    /* (i4_tmp >> shift) */
1859                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1860                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1861                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1862                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1863
1864                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1865                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1866                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1867                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1868                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1869
1870                    /* store four 8-bit output values  */
1871                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1872                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1873                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1874                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1875
1876                    /* To update pointer */
1877                    pi2_src1 += 8;
1878                    pi2_src2 += 8;
1879                    pu1_dst  += 8;
1880
1881                } /* inner loop ends here(8-output values in single iteration) */
1882
1883                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
1884                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
1885                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
1886
1887            }
1888        }
1889        else /* 2*wd multiple of 4 case */
1890        {
1891            WORD32 dst0, dst1, dst2, dst3;
1892            /*  outer for loop starts from here */
1893            for(row = 0; row < ht; row += 4)
1894            {
1895                for(col = 0; col < wdx2; col += 4)
1896                {
1897                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1898                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1899                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1900                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1901
1902                    /* row = 1 */
1903                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1904                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1905                    /* row = 2 */
1906                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
1907                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
1908                    /* row = 3 */
1909                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
1910                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
1911
1912                    /* Pack two rows together */
1913                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1914                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1915                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
1916                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
1917
1918                    /* (pi2_src1[col] + pi2_src2[col]) */
1919                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1920                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1921
1922                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1923                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1924                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1925
1926                    /* (i4_tmp >> shift) */
1927                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1928                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1929
1930                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1931                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1932                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1933
1934                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1935                    /* dst row = 1 to 3 */
1936                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1937                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
1938
1939                    /* store four 8-bit output values  */
1940                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1941
1942                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1943                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
1944                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
1945
1946                    /* row = 1 to row = 3 */
1947                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1948                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
1949                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
1950
1951                    /* To update pointer */
1952                    pi2_src1 += 4;
1953                    pi2_src2 += 4;
1954                    pu1_dst  += 4;
1955
1956                } /* inner loop ends here(4-output values in single iteration) */
1957
1958                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;   /* Pointer update */
1959                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;   /* Pointer update */
1960                pu1_dst  = pu1_dst  - wdx2 + 4 * dst_strd;    /* Pointer update */
1961
1962            }
1963        }
1964    }
1965    else /* ht multiple of 2 case */
1966    {
1967        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
1968        {
1969            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1970            /*  outer for loop starts from here */
1971            for(row = 0; row < ht; row += 2)
1972            {
1973                for(col = 0; col < wdx2; col += 16)
1974                {
1975                    /*load 8 pixel values */ /* First 8 Values */
1976                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1977                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1978                    /* row = 1 */
1979                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1980                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1981
1982                    /*load 8 pixel values */ /* Second 8 Values */
1983                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1984                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1985                    /* row = 1 */
1986                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1987                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1988
1989                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1990                    src_temp1_8x16b  = _mm_adds_epi16(src_temp1_8x16b,  src_temp2_8x16b);
1991                    src_temp3_8x16b  = _mm_adds_epi16(src_temp3_8x16b,  src_temp4_8x16b);
1992
1993                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1994                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1995                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1996
1997                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1998                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
1999                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
2000
2001                    /* (i4_tmp >> shift) */ /* First 8 Values */
2002                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
2003                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
2004
2005                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
2006                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
2007                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
2008
2009                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
2010                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2011                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
2012
2013                    /* (i4_tmp >> shift) */ /* Second 8 Values */
2014                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
2015                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
2016
2017                    /* store four 8-bit output values  */ /* First 8 Values */
2018                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
2019                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
2020
2021                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
2022                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
2023                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
2024
2025                    /* store four 8-bit output values  */ /* Second 8 Values */
2026                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
2027                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/
2028
2029                    /* To update pointer */
2030                    pi2_src1 += 16;
2031                    pi2_src2 += 16;
2032                    pu1_dst  += 16;
2033
2034                } /* inner loop ends here(8-output values in single iteration) */
2035
2036                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
2037                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
2038                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
2039
2040            }
2041        }
2042        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
2043        {
2044            /*  outer for loop starts from here */
2045            for(row = 0; row < ht; row += 2)
2046            {
2047                for(col = 0; col < wdx2; col += 8)
2048                {
2049                    /*load 8 pixel values */
2050                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
2051                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
2052                    /* row = 1 */
2053                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
2054                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
2055
2056                    /* (pi2_src1[col] + pi2_src2[col]) */
2057                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2058                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
2059
2060                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2061                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2062                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
2063
2064                    /* (i4_tmp >> shift) */
2065                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
2066                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
2067
2068                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2069                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2070                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
2071
2072                    /* store four 8-bit output values  */
2073                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
2074                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
2075
2076                    /* To update pointer */
2077                    pi2_src1 += 8;
2078                    pi2_src2 += 8;
2079                    pu1_dst  += 8;
2080
2081                } /* inner loop ends here(8-output values in single iteration) */
2082
2083                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
2084                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
2085                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
2086
2087            }
2088        }
2089        else /* 2*wd multiple of 4 case */
2090        {
2091            WORD32 dst0, dst1;
2092            /*  outer for loop starts from here */
2093            for(row = 0; row < ht; row += 2)
2094            {
2095                for(col = 0; col < wdx2; col += 4)
2096                {
2097                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
2098                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
2099                    /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
2100                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
2101                    /* row = 1 */
2102                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
2103                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
2104
2105                    /* Pack two rows together */
2106                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
2107                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
2108
2109                    /* (pi2_src1[col] + pi2_src2[col]) */
2110                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2111                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2112                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2113
2114                    /* (i4_tmp >> shift) */
2115                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
2116                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2117                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2118
2119                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
2120                    /* dst row = 1 */
2121                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
2122
2123                    /* store four 8-bit output values  */
2124                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
2125
2126                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
2127                    /* row = 1 */
2128                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
2129
2130                    /* To update pointer */
2131                    pi2_src1 += 4;
2132                    pi2_src2 += 4;
2133                    pu1_dst  += 4;
2134                } /* inner loop ends here(4-output values in single iteration) */
2135
2136                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;   /* Pointer update */
2137                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;   /* Pointer update */
2138                pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;    /* Pointer update */
2139
2140            }
2141        }
2142    }
2143}
2144