/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
/*                                                                           */
/*  Description       : Contains function definitions for weighted           */
/*                      prediction functions in x86 sse4 intrinsics          */
/*                                                                           */
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
/*                      ih264_default_weighted_pred_chroma_sse42()           */
/*                      ih264_weighted_pred_luma_sse42()                     */
/*                      ih264_weighted_pred_chroma_sse42()                   */
/*                      ih264_weighted_bi_pred_luma_sse42()                  */
/*                      ih264_weighted_bi_pred_chroma_sse42()                */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         30 01 2015   Kaushik         Initial version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"

/*****************************************************************************/
/*  Function Definitions                                                     */
/*****************************************************************************/
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for luma. The function gets   */
/*                  two ht x wd blocks, calculates their rounded average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
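/* For reference, a scalar model of the operation vectorized below (an
 * illustrative sketch only, not part of the build). _mm_avg_epu8 computes
 * the same rounded average, (a + b + 1) >> 1, on sixteen bytes at a time:
 *
 *     WORD32 i, j;
 *
 *     for(i = 0; i < ht; i++)
 *         for(j = 0; j < wd; j++)
 *             pu1_dst[i * dst_strd + j] =
 *                 (pu1_src1[i * src_strd1 + j] +
 *                  pu1_src2[i * src_strd2 + j] + 1) >> 1;
 */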
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
                                            UWORD8 *pu1_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 ht,
                                            WORD32 wd)
{
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;

    if(wd == 4)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;

        do
        {
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
            y0_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));

            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
            y1_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);

            ht -= 8;
            pu1_src1 += src_strd1 << 3;
            pu1_src2 += src_strd2 << 3;
            pu1_dst += dst_strd << 3;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for chroma. The function gets */
/*                  two ht x wd blocks, calculates their rounded average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
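/* Reference sketch (illustrative only): the chroma sources hold interleaved
 * u/v samples, so a row of wd chroma pixels spans 2 * wd bytes; the rounded
 * average itself is identical to the luma path:
 *
 *     WORD32 i, j;
 *
 *     for(i = 0; i < ht; i++)
 *         for(j = 0; j < 2 * wd; j++)
 *             pu1_dst[i * dst_strd + j] =
 *                 (pu1_src1[i * src_strd1 + j] +
 *                  pu1_src2[i * src_strd2 + j] + 1) >> 1;
 */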
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
                                              UWORD8 *pu1_src2,
                                              UWORD8 *pu1_dst,
                                              WORD32 src_strd1,
                                              WORD32 src_strd2,
                                              WORD32 dst_strd,
                                              WORD32 ht,
                                              WORD32 wd)
{
    __m128i uv0_0_16x8b, uv0_1_16x8b;
    __m128i uv1_0_16x8b, uv1_1_16x8b;

    if(wd == 2)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i uv0_2_16x8b, uv0_3_16x8b;
        __m128i uv1_2_16x8b, uv1_3_16x8b;

        do
        {
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            uv0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            uv0_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + src_strd1 * 3));

            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            uv1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            uv1_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + src_strd2 * 3));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
            _mm_storeu_si128(
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets one      */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
/*                                                                           */
/*  Inputs        : pu1_src  - Pointer to source                             */
/*                  pu1_dst  - Pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd   - number of bits to be rounded off              */
/*                  wt       - weight value                                  */
/*                  ofst     - offset value                                  */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
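/* Scalar model of the weighting below (illustrative sketch only), where
 * CLIP_U8(x) denotes saturation of x to the range [0, 255], the job done
 * by _mm_packus_epi16 in the vector code:
 *
 *     WORD32 i, j;
 *
 *     for(i = 0; i < ht; i++)
 *         for(j = 0; j < wd; j++)
 *             pu1_dst[i * dst_strd + j] =
 *                 CLIP_U8(((pu1_src[i * src_strd + j] * wt
 *                           + (1 << (log_wd - 1))) >> log_wd) + ofst);
 */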
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd,
                                    WORD32 dst_strd,
                                    WORD32 log_wd,
                                    WORD32 wt,
                                    WORD32 ofst,
                                    WORD32 ht,
                                    WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 round_val;

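    /* wt carries the weight in its low 16 bits and ofst carries the offset
     * in its low 8 bits; the casts below sign-extend both before they are
     * broadcast across the vector registers. */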
    wt = (WORD16)(wt & 0xffff);
    round_val = 1 << (log_wd - 1);
    ofst = (WORD8)(ofst & 0xff);

    wt_8x16b = _mm_set1_epi16(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y_0_8x16b, y_2_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for chroma. The function gets one    */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
/*                                                                           */
/*  Inputs        : pu1_src  - Pointer to source                             */
/*                  pu1_dst  - Pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd   - number of bits to be rounded off              */
/*                  wt       - weight values for u and v                     */
/*                  ofst     - offset values for u and v                     */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
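/* Reference sketch (illustrative only): wt packs the u weight in its low
 * 16 bits and the v weight in its high 16 bits, so broadcasting it with
 * _mm_set1_epi32 lines the weights up with the interleaved u/v samples.
 * Per sample the operation is the same as in the luma path,
 *
 *     out = CLIP_U8(((in * wt_plane + (1 << (log_wd - 1))) >> log_wd)
 *                   + ofst_plane);
 *
 * with wt_plane/ofst_plane chosen per plane (u or v), and CLIP_U8 denoting
 * saturation to [0, 255].
 */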
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

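    /* ofst carries the u offset in bits 0..7 and the v offset in bits
     * 8..15; both are sign-extended and repacked below as alternating
     * 16-bit lanes to match the interleaved samples. */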
    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    round_val = 1 << (log_wd - 1);
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    wt_8x16b = _mm_set1_epi32(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);

    if(wd == 2)
    {
        __m128i y_0_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y_0_8x16b, y_1_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
/*                                                                           */
/*  Description   : This function performs the weighted biprediction as      */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets two      */
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  log_wd    - number of bits to be rounded off             */
/*                  wt1       - weight value for source 1                    */
/*                  wt2       - weight value for source 2                    */
/*                  ofst1     - offset value for source 1                    */
/*                  ofst2     - offset value for source 2                    */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
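/* Scalar model of the bi-prediction below (illustrative sketch only),
 * where CLIP_U8(x) denotes saturation of x to [0, 255]:
 *
 *     WORD32 i, j;
 *
 *     for(i = 0; i < ht; i++)
 *         for(j = 0; j < wd; j++)
 *             pu1_dst[i * dst_strd + j] =
 *                 CLIP_U8(((pu1_src1[i * src_strd1 + j] * wt1
 *                           + pu1_src2[i * src_strd2 + j] * wt2
 *                           + (1 << log_wd)) >> (log_wd + 1))
 *                         + ((ofst1 + ofst2 + 1) >> 1));
 */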
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

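    /* The weights arrive in the low 16 bits of wt1/wt2 and the offsets in
     * the low 8 bits of ofst1/ofst2; the two offsets collapse into a single
     * rounded average that is applied after the shift. */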
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

1069/*****************************************************************************/
1070/*                                                                           */
1071/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072/*                                                                           */
1073/*  Description   : This function performs the weighted biprediction as      */
1074/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075/*                  prediction process" for chroma. The function gets two    */
1076/*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078/*                  stores it in the destination block. (ht,wd) can be       */
1079/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080/*                                                                           */
1081/*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082/*                  pu1_src2  - Pointer to source 2                          */
1083/*                  pu1_dst   - Pointer to destination                       */
1084/*                  src_strd1 - stride for source 1                          */
1085/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  log_wd    - number of bits to be rounded off             */
/*                  wt1       - weight values for u and v in source 1        */
/*                  wt2       - weight values for u and v in source 2        */
/*                  ofst1     - offset value for u and v in source 1         */
/*                  ofst2     - offset value for u and v in source 2         */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
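/* For each interleaved U/V sample, with the per-component weight and offset */
/* taken from the packed arguments, the code below computes                  */
/*   ((src1 * wt1 + src2 * wt2 + (1 << log_wd)) >> (log_wd + 1))             */
/*   + ((ofst1 + ofst2 + 1) >> 1), saturated to the range [0, 255].          */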
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    round_val = 1 << log_wd;
    shft = log_wd + 1;

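    /* extract the signed 8-bit U and V offsets from the packed arguments */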
    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

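    /* wt1 and wt2 each pack the U and V weights into one 32-bit value;     */
    /* replicating them matches the interleaved UVUV layout of 16-bit lanes */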
    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

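    /* average the source 1 and source 2 offsets and repack U and V */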
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
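            /* load 4 bytes (2 interleaved UV pairs) from two rows of each source */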
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

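            /* combine both rows of each source into one register and widen to 16 bits */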
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

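            /* weight, add, round, shift and offset both rows at once */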
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

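            /* saturate to 8 bits and store 4 bytes to each destination row */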
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
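            /* load 8 bytes (4 interleaved UV pairs) per row from each source */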
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

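            /* saturate to 8 bits, pack both rows and store 8 bytes to each */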
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else /* wd == 8 */
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

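        /* zero vector used to zero-extend the high 8 bytes of each row */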
        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
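            /* load 16 bytes (8 interleaved UV pairs) per row from each source */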
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

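            /* widen the low and high halves of each row to 16 bits */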
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

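            /* weight both sources, then add, round, shift and offset */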
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

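            /* saturate to 8 bits, pack and store 16 bytes per row */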
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}