/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_x86_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_sse42()
*   - ihevc_weighted_pred_bi_sse42()
*   - ihevc_weighted_pred_bi_default_sse42()
*   - ihevc_weighted_pred_chroma_uni_sse42()
*   - ihevc_weighted_pred_chroma_bi_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"

#include <immintrin.h>
/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
* offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

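/* For reference, a scalar sketch of the per-pixel operation vectorized
 * below, pieced together from the step comments inside this function
 * (an illustration, not the library's actual C reference implementation):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
 *             i4_tmp += 1 << (shift - 1);
 *             i4_tmp = (i4_tmp >> shift) + off0;
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp);
 *         }
 */
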
void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 dst0, dst1, dst2, dst3;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit values packed in the register                      */
    __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption */
    ASSERT(ht % 4 == 0); /* checking assumption */

    temp = 1 << (shift - 1);

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    off0_4x32b = _mm_set1_epi32(off0);
    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 8)
            {   /* for row = 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* row = 0 */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                /* row = 1 */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                /* row = 2 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
                /* row = 3 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));

                /* considering pix. 3:0 by converting 16-bit into 32-bit */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);

                /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
                src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

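                /* The two-stage saturating pack below implements CLIP_U8:   */
                /* _mm_packs_epi32 saturates signed 32-bit to signed 16-bit, */
                /* then _mm_packus_epi16 saturates signed 16-bit to unsigned */
                /* 8-bit, so every result lands in [0, 255].                 */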
                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values  */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/

                /* To update pointer */
                pi2_src += 8;
                pu1_dst += 8;

            } /* inner loop ends here(8 output values per iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
    else  /* wd multiple of 4 case */
    {
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {   /* for row = 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* considering pix. 3:0 by converting 16-bit into 32-bit */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift)*/
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);

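                /* src_temp0_4x32b now holds all four output rows, four     */
                /* packed bytes per 32-bit lane; shuffle each lane down to  */
                /* position 0 and extract it with _mm_cvtsi128_si32.        */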
                dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                /* dst row = 1 to 3 */
                src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
                src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);

                /* row = 1 to row = 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here(4 output values per iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
* offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

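/* For reference, a scalar sketch of the per-pixel operation vectorized
 * below. The chroma samples are stored interleaved (Cb, Cr, Cb, Cr, ...),
 * so the Cb and Cr weights/offsets alternate with the column index (an
 * illustration, not the library's actual C reference implementation):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < 2 * wd; col++)
 *         {
 *             WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
 *             WORD32 off0 = (col & 1) ? off0_cr : off0_cb;
 *             WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
 *             i4_tmp += 1 << (shift - 1);
 *             i4_tmp = (i4_tmp >> shift) + off0;
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp);
 *         }
 */
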
void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit values packed in the register                      */

    __m128i src_temp0_4x32b, src_temp1_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
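    /* _mm_set_epi32 takes its arguments highest lane first, so lanes 0 and */
    /* 2 hold the Cb weight/offset and lanes 1 and 3 hold the Cr ones,      */
    /* matching the interleaved Cb/Cr order of the source samples.          */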

#if 0 /* Enable this for the ht % 4 == 0 case. It degraded performance for smaller sizes while improving it for larger ones. */
    if( 0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
            __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=16)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));

                    /* considering pix. 3:0 by converting 16-bit into 32-bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 11:8 by converting 16-bit into 32-bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 15:12 by converting 16-bit into 32-bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/

                    /* row = 2 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
                    /* row = 2 */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
                    /* row = 2 */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
                    /* row = 3 */
                    src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
                    /* row = 2 */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
                    /* row = 3 */
                    src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));

                    /* considering pix. 3:0 by converting 16-bit into 32-bit */
                    src_temp8_4x32b  = _mm_cvtepi16_epi32(src_temp8_4x32b);
                    src_temp9_4x32b  = _mm_cvtepi16_epi32(src_temp9_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp8_4x32b  = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
                    src_temp9_4x32b  = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Second 4 pixels */
                    src_temp10_4x32b  = _mm_cvtepi16_epi32(src_temp10_4x32b);
                    src_temp11_4x32b  = _mm_cvtepi16_epi32(src_temp11_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp10_4x32b  = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
                    src_temp11_4x32b  = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);

                    /* considering pix. 11:8 by converting 16-bit into 32-bit */ /* Third 4 pixels */
                    src_temp12_4x32b  = _mm_cvtepi16_epi32(src_temp12_4x32b);
                    src_temp13_4x32b  = _mm_cvtepi16_epi32(src_temp13_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp12_4x32b  = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
                    src_temp13_4x32b  = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);

                    /* considering pix. 15:12 by converting 16-bit into 32-bit */ /* Last 4 pixels */
                    src_temp14_4x32b  = _mm_cvtepi16_epi32(src_temp14_4x32b);
                    src_temp15_4x32b  = _mm_cvtepi16_epi32(src_temp15_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp14_4x32b  = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
                    src_temp15_4x32b  = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b,  shift);
                    src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b,  shift);
                    src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b,  shift);
                    src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b,  shift);
                    src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);

                    src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
                    src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
                    src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
                    src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
                    src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/
                    _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here(16 output values per iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=8)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 2 */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));

                    /* considering pix. 3:0 by converting 16-bit into 32-bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 3:0 by converting 16-bit into 32-bit */ /* rows 2, 3 */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit into 32-bit */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* rows 2, 3 */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
                    src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
                    src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);

                    /* store eight 8-bit output values  */
                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2*/
                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3*/

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here(8 output values per iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=4)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* considering pix. 3:0 by converting 16-bit into 32-bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 to 3 */
                    src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
                    src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
                    src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);

                    /* store four 8-bit output values  */
                    *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                    dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
                    /* row = 1 */
                    *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
                    /* row = 2 */
                    *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
                    /* row = 3 */
                    *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here(4 output values per iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
    }
    else /* ht multiple of 2 case */
#endif
911
912    {
913        if(0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */
914        {
915            __m128i src_temp2_4x32b, src_temp3_4x32b;
916            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
917            /*  outer for loop starts from here */
918            for(row = 0; row < ht; row += 2)
919            {
920                for(col = 0; col < wdx2; col += 16)
921                {
922                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
923                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
924                    /* row = 1 */
925                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
926
927                    /* row = 0 */ /* Second 4 pixels */
928                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
929                    /* row = 1 */
930                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
931                    /* row = 0 */ /* Third 4 pixels */
932                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
933                    /* row = 1 */
934                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
935                    /* row = 0 */ /* Last 4 pixels */
936                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
937                    /* row = 1 */
938                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));
939
940                    /* considering pix. 4:0 by converting 16-into 32 bit */
941                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
942                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
943                    /* (pi2_src[col] + lvl_shift)*/
944                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
945                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
946                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
947                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
948                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
949
950                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
951                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
952                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
953                    /* (pi2_src[col] + lvl_shift)*/
954                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
955                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
956                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
957                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
958                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
959
960                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
961                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
962                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
963                    /* (pi2_src[col] + lvl_shift)*/
964                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
965                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
966                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
967                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
968                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
969
970                    /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

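                    /* Two-stage saturating pack implements the clip:
                       _mm_packs_epi32 narrows 32->16 with signed saturation,
                       _mm_packus_epi16 narrows 16->8 with unsigned saturation,
                       together giving CLIP_U8 behaviour */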
                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here (16 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));

                    /* sign-extend pix. 3:0 from 16 to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 to 32 bit */ /* Last 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here (8 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* sign-extend pix. 3:0 from 16 to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);

                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 */
                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here (4 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
* off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
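
/* Illustrative scalar equivalent of the kernel below (a minimal sketch of
 * the per-pixel computation that the SIMD code vectorises):
 *
 *     for(row = 0; row < ht; row++)
 *     {
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
 *                           + (pi2_src2[col] + lvl_shift2) * wgt1;
 *             i4_tmp += (off0 + off1 + 1) << (shift - 1);
 *             pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
 *         }
 *         pi2_src1 += src_strd1; pi2_src2 += src_strd2; pu1_dst += dst_strd;
 *     }
 */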

void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
                                  WORD16 *pi2_src2,
                                  UWORD8 *pu1_dst,
                                  WORD32 src_strd1,
                                  WORD32 src_strd2,
                                  WORD32 dst_strd,
                                  WORD32 wgt0,
                                  WORD32 off0,
                                  WORD32 wgt1,
                                  WORD32 off1,
                                  WORD32 shift,
                                  WORD32 lvl_shift1,
                                  WORD32 lvl_shift2,
                                  WORD32 ht,
                                  WORD32 wd)
{
    WORD32 row, col, temp;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;


    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp = (off0 + off1 + 1) << (shift - 1);
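    /* off0, off1 and the rounding bias are folded into one constant so the
       inner loops need a single add after the weighted sum */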

    /* setting values in registers */
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    wgt1_4x32b = _mm_set1_epi32(wgt1);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 8)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* sign-extend pix. 3:0 from 16 to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here (8 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* sign-extend pix. 3:0 from 16 to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* dst row = 1 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }

}

/**
*******************************************************************************
*
* @brief
* Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
* off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/
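
/* Note: chroma samples are interleaved (Cb, Cr, Cb, Cr, ...), so the weight
 * and offset registers below hold alternating Cb/Cr values and one vector
 * multiply weights both components at once. A minimal scalar sketch of the
 * per-sample computation for a Cb sample (a Cr sample uses the _cr values):
 *
 *     WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0_cb
 *                   + (pi2_src2[col] + lvl_shift2) * wgt1_cb;
 *     i4_tmp += (off0_cb + off1_cb + 1) << (shift - 1);
 *     pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
 */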

void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
                                         WORD16 *pi2_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 wgt0_cb,
                                         WORD32 wgt0_cr,
                                         WORD32 off0_cb,
                                         WORD32 off0_cr,
                                         WORD32 wgt1_cb,
                                         WORD32 wgt1_cr,
                                         WORD32 off1_cb,
                                         WORD32 off1_cr,
                                         WORD32 shift,
                                         WORD32 lvl_shift1,
                                         WORD32 lvl_shift2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 row, col, temp1, temp2;
    WORD32 wdx2;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;


    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);

    /* setting values in registers */
    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);

    wdx2 = wd * 2;
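    /* wd is the width of one chroma component; an interleaved CbCr row holds
       2*wd samples, so the column loops below run over wdx2 */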

    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 8)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* sign-extend pix. 3:0 from 16 to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here (8 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wdx2 multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 4)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* sign-extend pix. 3:0 from 16 to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /*(pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* dst row = 1 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
        }
    }

}

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%2 == 0, wd%4 == 0
* shift == 7, (lvl_shift1 + lvl_shift2) can take values {0, 8K, 16K}; in
* these cases the final result matches even if the intermediate arithmetic
* is kept in 16 bits.
*
*******************************************************************************
*/
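
/* Illustrative scalar equivalent of the kernel below (a minimal sketch):
 *
 *     WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1)
 *                   + (pi2_src2[col] + lvl_shift2)
 *                   + (1 << (shift - 1));
 *     pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
 *
 * With shift == 7 and (lvl_shift1 + lvl_shift2) bounded as stated in the
 * remarks above, the sum fits in 16 bits, which is why the SIMD path can
 * use saturating 16-bit adds instead of widening to 32 bits.
 */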

void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);
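    /* for 8-bit content SHIFT_14_MINUS_BIT_DEPTH is 6, so shift = 7 and the
       rounding term is 64 */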

    /* setting values in registers */
    lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
    lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
    const_temp_8x16b = _mm_set1_epi16(temp);

    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
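    /* lvl_shift1_8x16b now holds (lvl_shift1 + lvl_shift2 + rounding term),
       so one saturating add applies the entire bias in the loops below */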

    if(0 == (ht & 3)) /* ht multiple of 4*/
    {
        if(0 == (wd & 15)) /* wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                    /* row = 2 */
                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /*load 8 pixel values */ /* Second 8 Values */
                    /* row = 3 */
                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8-bit values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);

                    /* store sixteen 8-bit output values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst  += 16;

                } /* inner loop ends here (16 output values per row per iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

            }
        }
        else if(0 == (wd & 7)) /* multiple of 8 case */
        {
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here (8 output values per row per iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

            }
        }
        else /* wd multiple of 4 case*/
        {
            WORD32 dst0, dst1, dst2, dst3;

            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 4)
                {
                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
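                    /* rows 0 & 1 (and 2 & 3) now share one register, so each
                       add/shift below processes two output rows at once */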

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* move row 1 and row 3 results to the low dword */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* row = 1 to row = 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here (4 output values per row per iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */

            }
        }
    }
    else /* ht multiple of 2 case and wd multiple of 4 case*/
    {

        WORD32 dst0, dst1;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* Pack two rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                /* dst row = 1 */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* To update pointer */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst  += 4;

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */

        }

    }

}
