1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20
21/**
22 *******************************************************************************
23 * @file
24 *  impeg2_inter_pred_sse42_intr.c
25 *
26 * @brief
27 *  Contains Motion compensation function definitions for MPEG2 decoder
28 *
29 * @author
30 *  Mohit [100664]
31 *
32 * - impeg2_copy_mb_sse42()
33 * - impeg2_interpolate_sse42()
34 * - impeg2_mc_halfx_halfy_8x8_sse42()
35 * - impeg2_mc_halfx_fully_8x8_sse42()
36 * - impeg2_mc_fullx_halfy_8x8_sse42()
37 * - impeg2_mc_fullx_fully_8x8_sse42()
38 *
39 * @remarks
40 *  None
41 *
42 *******************************************************************************
43 */
44#include <stdio.h>
45#include <string.h>
46#include "iv_datatypedef.h"
47#include "impeg2_macros.h"
48#include "impeg2_defs.h"
49#include "impeg2_inter_pred.h"
50
51#include <immintrin.h>
52#include <emmintrin.h>
53#include <smmintrin.h>
54#include <tmmintrin.h>
55
56/*******************************************************************************
57*  Function Name   : impeg2_copy_mb
58*
59*  Description     : copies 3 components to the frame from mc_buf
60*
61*  Arguments       :
62*  src_buf         : Source Buffer
63*  dst_buf         : Destination Buffer
64*  src_wd          : Source Width
65*  dst_wd          : destination Width
66*
67*  Values Returned : None
68*******************************************************************************/
69void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
70                    yuv_buf_t *dst_buf,
71                    UWORD32 src_wd,
72                    UWORD32 dst_wd)
73{
74    UWORD8 *src;
75    UWORD8 *dst;
76    __m128i src_r0, src_r1, src_r2, src_r3;
77
78    /*******************************************************/
79    /* copy Y                                              */
80    /*******************************************************/
81    src = src_buf->pu1_y;
82    dst = dst_buf->pu1_y;
83    // Row 0-3
84    src_r0 = _mm_loadu_si128((__m128i *) (src));
85    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
86    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
87    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
88
89    _mm_storeu_si128((__m128i *) dst, src_r0);
90    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
91    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
92    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
93
94    // Row 4-7
95    src += 4 * src_wd;
96    dst += 4 * dst_wd;
97    src_r0 = _mm_loadu_si128((__m128i *) (src));
98    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
99    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
100    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
101
102    _mm_storeu_si128((__m128i *) dst, src_r0);
103    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
104    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
105    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
106
107    // Row 8-11
108    src += 4 * src_wd;
109    dst += 4 * dst_wd;
110    src_r0 = _mm_loadu_si128((__m128i *) (src));
111    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
112    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
113    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
114
115    _mm_storeu_si128((__m128i *) dst, src_r0);
116    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
117    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
118    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
119
120    // Row 12-15
121    src += 4 * src_wd;
122    dst += 4 * dst_wd;
123    src_r0 = _mm_loadu_si128((__m128i *) (src));
124    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
125    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
126    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
127
128    _mm_storeu_si128((__m128i *) dst, src_r0);
129    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
130    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
131    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
132
133    src_wd >>= 1;
134    dst_wd >>= 1;
135
136    /*******************************************************/
137    /* copy U                                              */
138    /*******************************************************/
139    src = src_buf->pu1_u;
140    dst = dst_buf->pu1_u;
141
142    // Row 0-3
143    src_r0 =  _mm_loadl_epi64((__m128i *)src);
144    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
145    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
146    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
147
148    _mm_storel_epi64((__m128i *)dst, src_r0);
149    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
150    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
151    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
152
153    // Row 4-7
154    src += 4 * src_wd;
155    dst += 4 * dst_wd;
156
157    src_r0 =  _mm_loadl_epi64((__m128i *)src);
158    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
159    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
160    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
161
162    _mm_storel_epi64((__m128i *)dst, src_r0);
163    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
164    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
165    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
166
167    /*******************************************************/
168    /* copy V                                              */
169    /*******************************************************/
170    src = src_buf->pu1_v;
171    dst = dst_buf->pu1_v;
172    // Row 0-3
173    src_r0 =  _mm_loadl_epi64((__m128i *)src);
174    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
175    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
176    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
177
178    _mm_storel_epi64((__m128i *)dst, src_r0);
179    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
180    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
181    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
182
183    // Row 4-7
184    src += 4 * src_wd;
185    dst += 4 * dst_wd;
186
187    src_r0 =  _mm_loadl_epi64((__m128i *)src);
188    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
189    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
190    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
191
192    _mm_storel_epi64((__m128i *)dst, src_r0);
193    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
194    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
195    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
196}
197
198/*****************************************************************************/
199/*                                                                           */
200/*  Function Name : impeg2_interpolate                                       */
201/*                                                                           */
202/*  Description   : averages the contents of buf_src1 and buf_src2 and stores*/
203/*                  result in buf_dst                                        */
204/*                                                                           */
205/*  Inputs        : buf_src1 -  First Source                                 */
206/*                  buf_src2 -  Second Source                                */
207/*                                                                           */
208/*  Globals       : None                                                     */
209/*                                                                           */
210/*  Processing    : Avg the values from two sources and store the result in  */
211/*                  destination buffer                                       */
212/*                                                                           */
213/*  Outputs       : buf_dst  -  Avg of contents of buf_src1 and buf_src2     */
214/*                                                                           */
215/*  Returns       : None                                                     */
216/*                                                                           */
217/*  Issues        : Assumes that all 3 buffers are of same size              */
218/*                                                                           */
219/*****************************************************************************/
220void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
221                        yuv_buf_t *buf_src2,
222                        yuv_buf_t *buf_dst,
223                        UWORD32 stride)
224{
225    UWORD8 *src1, *src2;
226    UWORD8 *dst;
227    __m128i src1_r0, src1_r1, src1_r2, src1_r3;
228    __m128i src2_r0, src2_r1, src2_r2, src2_r3;
229
230    /*******************************************************/
231    /* interpolate Y                                       */
232    /*******************************************************/
233    src1 = buf_src1->pu1_y;
234    src2 = buf_src2->pu1_y;
235    dst  = buf_dst->pu1_y;
236    // Row 0-3
237    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
238    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
239    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
240    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
241
242    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
243    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
244    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
245    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
246
247    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
248    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
249    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
250    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
251
252    _mm_storeu_si128((__m128i *) dst, src1_r0);
253    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
254    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
255    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
256
257    // Row 4-7
258    src1 += 4 * 16;
259    src2 += 4 * 16;
260    dst += 4 * stride;
261    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
262    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
263    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
264    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
265
266    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
267    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
268    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
269    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
270
271    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
272    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
273    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
274    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
275
276    _mm_storeu_si128((__m128i *) dst, src1_r0);
277    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
278    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
279    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
280
281    // Row 8-11
282    src1 += 4 * 16;
283    src2 += 4 * 16;
284    dst += 4 * stride;
285    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
286    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
287    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
288    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
289
290    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
291    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
292    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
293    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
294
295    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
296    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
297    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
298    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
299
300    _mm_storeu_si128((__m128i *) dst, src1_r0);
301    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
302    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
303    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
304
305    // Row 12-15
306    src1 += 4 * 16;
307    src2 += 4 * 16;
308    dst += 4 * stride;
309    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
310    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
311    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
312    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
313
314    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
315    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
316    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
317    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
318
319    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
320    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
321    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
322    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
323
324    _mm_storeu_si128((__m128i *) dst, src1_r0);
325    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
326    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
327    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
328
329    stride >>= 1;
330
331    /*******************************************************/
332    /* interpolate U                                       */
333    /*******************************************************/
334    src1 = buf_src1->pu1_u;
335    src2 = buf_src2->pu1_u;
336    dst  = buf_dst->pu1_u;
337    // Row 0-3
338    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
339    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
340    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
341    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
342
343    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
344    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
345    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
346    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
347
348    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
349    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
350    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
351    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
352
353    _mm_storel_epi64((__m128i *) dst, src1_r0);
354    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
355    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
356    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
357
358    // Row 4-7
359    src1 += 4 * 8;
360    src2 += 4 * 8;
361    dst += 4 * stride;
362
363    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
364    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
365    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
366    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
367
368    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
369    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
370    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
371    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
372
373    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
374    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
375    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
376    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
377
378    _mm_storel_epi64((__m128i *) dst, src1_r0);
379    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
380    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
381    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
382
383    /*******************************************************/
384    /* interpolate V                                       */
385    /*******************************************************/
386    src1 = buf_src1->pu1_v;
387    src2 = buf_src2->pu1_v;
388    dst  = buf_dst->pu1_v;
389
390    // Row 0-3
391    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
392    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
393    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
394    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
395
396    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
397    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
398    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
399    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
400
401    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
402    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
403    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
404    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
405
406    _mm_storel_epi64((__m128i *) dst, src1_r0);
407    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
408    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
409    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
410
411    // Row 4-7
412    src1 += 4 * 8;
413    src2 += 4 * 8;
414    dst += 4 * stride;
415
416    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
417    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
418    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
419    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
420
421    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
422    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
423    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
424    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
425
426    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
427    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
428    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
429    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
430
431    _mm_storel_epi64((__m128i *) dst, src1_r0);
432    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
433    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
434    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
435}
436
437/*****************************************************************************/
438/*                                                                           */
439/*  Function Name : impeg2_mc_halfx_halfy_8x8_sse42()                                 */
440/*                                                                           */
441/*  Description   : Gets the buffer from (0.5,0.5) to (8.5,8.5)              */
442/*                  and the above block of size 8 x 8 will be placed as a    */
443/*                  block from the current position of out_buf               */
444/*                                                                           */
445/*  Inputs        : ref - Reference frame from which the block will be       */
446/*                        block will be extracted.                           */
447/*                  ref_wid - WIdth of reference frame                       */
448/*                  out_wid - WIdth of the output frame                      */
449/*                  blk_width  - width of the block                          */
450/*                  blk_width  - height of the block                         */
451/*                                                                           */
452/*  Globals       : None                                                     */
453/*                                                                           */
454/*  Processing    : Point to the (0,0),(1,0),(0,1),(1,1) position in         */
455/*                  the ref frame.Interpolate these four values to get the   */
456/*                  value at(0.5,0.5).Repeat this to get an 8 x 8 block      */
457/*                  using 9 x 9 block from reference frame                   */
458/*                                                                           */
459/*  Outputs       : out -  Output containing the extracted block             */
460/*                                                                           */
461/*  Returns       : None                                                     */
462/*                                                                           */
463/*  Issues        : None                                                     */
464/*                                                                           */
465/*****************************************************************************/
466void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
467                            UWORD8 *ref,
468                            UWORD32 ref_wid,
469                            UWORD32 out_wid)
470{
471    UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3;
472    /* P0-P3 are the pixels in the reference frame and Q is the value being */
473    /* estimated                                                            */
474    /*
475       P0 P1
476         Q
477       P2 P3
478    */
479    __m128i src_r0, src_r0_1, src_r1, src_r1_1;
480    __m128i tmp0, tmp1;
481    __m128i value_2 = _mm_set1_epi16(2);
482
483    ref_p0 = ref;
484    ref_p1 = ref + 1;
485    ref_p2 = ref + ref_wid;
486    ref_p3 = ref + ref_wid + 1;
487
488    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
489    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
490    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 1
491    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
492
493    src_r0 =  _mm_cvtepu8_epi16(src_r0);
494    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
495    src_r1 =  _mm_cvtepu8_epi16(src_r1);
496    src_r1_1 =  _mm_cvtepu8_epi16(src_r1_1);
497
498    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 0 horizontal interpolation
499    tmp1 = _mm_add_epi16(src_r1, src_r1_1);             //Row 1 horizontal interpolation
500    tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 0 vertical interpolation
501    tmp0 = _mm_add_epi16(tmp0, value_2);
502    tmp0 =  _mm_srli_epi16(tmp0, 2);
503    tmp0 = _mm_packus_epi16(tmp0, value_2);
504
505    _mm_storel_epi64((__m128i *)out, tmp0);
506
507    //Row 1
508    ref_p2 += ref_wid;
509    ref_p3 += ref_wid;
510    out += out_wid;
511
512    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 2
513    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
514
515    src_r0 =  _mm_cvtepu8_epi16(src_r0);
516    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
517
518    tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 2 horizontal interpolation
519    tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 1 vertical interpolation
520    tmp1 = _mm_add_epi16(tmp1, value_2);
521    tmp1 =  _mm_srli_epi16(tmp1, 2);
522    tmp1 = _mm_packus_epi16(tmp1, value_2);
523
524    _mm_storel_epi64((__m128i *)out, tmp1);
525
526    //Row 2
527    ref_p2 += ref_wid;
528    ref_p3 += ref_wid;
529    out += out_wid;
530
531    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 3
532    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
533
534    src_r0 =  _mm_cvtepu8_epi16(src_r0);
535    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
536
537    tmp1 = _mm_add_epi16(src_r0, src_r0_1);         //Row 3 horizontal interpolation
538
539    tmp0 = _mm_add_epi16(tmp0, tmp1);               //Row 2 vertical interpolation
540    tmp0 = _mm_add_epi16(tmp0, value_2);
541    tmp0 =  _mm_srli_epi16(tmp0, 2);
542    tmp0 = _mm_packus_epi16(tmp0, value_2);
543
544    _mm_storel_epi64((__m128i *)out, tmp0);
545
546    //Row 3
547    ref_p2 += ref_wid;
548    ref_p3 += ref_wid;
549    out += out_wid;
550
551    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 4
552    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
553
554    src_r0 =  _mm_cvtepu8_epi16(src_r0);
555    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
556
557    tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 4 horizontal interpolation
558
559    tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 3 vertical interpolation
560    tmp1 = _mm_add_epi16(tmp1, value_2);
561    tmp1 =  _mm_srli_epi16(tmp1, 2);
562    tmp1 = _mm_packus_epi16(tmp1, value_2);
563
564    _mm_storel_epi64((__m128i *)out, tmp1);
565
566    //Row 4
567    ref_p2 += ref_wid;
568    ref_p3 += ref_wid;
569    out += out_wid;
570
571    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 5
572    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
573
574    src_r0 =  _mm_cvtepu8_epi16(src_r0);
575    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
576
577    tmp1 = _mm_add_epi16(src_r0, src_r0_1);     //Row 5 horizontal interpolation
578
579    tmp0 = _mm_add_epi16(tmp0, tmp1);           //Row 4 vertical interpolation
580    tmp0 = _mm_add_epi16(tmp0, value_2);
581    tmp0 =  _mm_srli_epi16(tmp0, 2);
582    tmp0 = _mm_packus_epi16(tmp0, value_2);
583
584    _mm_storel_epi64((__m128i *)out, tmp0);
585
586    //Row 5
587    ref_p2 += ref_wid;
588    ref_p3 += ref_wid;
589    out += out_wid;
590
591    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 6
592    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
593
594    src_r0 =  _mm_cvtepu8_epi16(src_r0);
595    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
596
597    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 6 horizontal interpolation
598
599    tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 5 vertical interpolation
600    tmp1 = _mm_add_epi16(tmp1, value_2);
601    tmp1 =  _mm_srli_epi16(tmp1, 2);
602    tmp1 = _mm_packus_epi16(tmp1, value_2);
603
604    _mm_storel_epi64((__m128i *)out, tmp1);
605
606    //Row 6
607    ref_p2 += ref_wid;
608    ref_p3 += ref_wid;
609    out += out_wid;
610
611    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 7
612    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
613
614    src_r0 =  _mm_cvtepu8_epi16(src_r0);
615    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
616
617    tmp1 = _mm_add_epi16(src_r0, src_r0_1);             //Row 7 horizontal interpolation
618
619    tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 6 vertical interpolation
620    tmp0 = _mm_add_epi16(tmp0, value_2);
621    tmp0 =  _mm_srli_epi16(tmp0, 2);
622    tmp0 = _mm_packus_epi16(tmp0, value_2);
623
624    _mm_storel_epi64((__m128i *)out, tmp0);
625
626    //Row 7
627    ref_p2 += ref_wid;
628    ref_p3 += ref_wid;
629    out += out_wid;
630
631    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 8
632    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
633
634    src_r0 =  _mm_cvtepu8_epi16(src_r0);
635    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
636
637    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 8 horizontal interpolation
638
639    tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 7 vertical interpolation
640    tmp1 = _mm_add_epi16(tmp1, value_2);
641    tmp1 =  _mm_srli_epi16(tmp1, 2);
642    tmp1 = _mm_packus_epi16(tmp1, value_2);
643
644    _mm_storel_epi64((__m128i *)out, tmp1);
645
646    return;
647}
648
649/*****************************************************************************/
650/*                                                                           */
651/*  Function Name : impeg2_mc_halfx_fully_8x8_sse42()                                 */
652/*                                                                           */
653/*  Description   : Gets the buffer from (0.5,0) to (8.5,8)                  */
654/*                  and the above block of size 8 x 8 will be placed as a    */
655/*                  block from the current position of out_buf               */
656/*                                                                           */
657/*  Inputs        : ref - Reference frame from which the block will be       */
658/*                        block will be extracted.                           */
659/*                  ref_wid - WIdth of reference frame                       */
660/*                  out_wid - WIdth of the output frame                      */
661/*                  blk_width  - width of the block                          */
662/*                  blk_width  - height of the block                         */
663/*                                                                           */
664/*  Globals       : None                                                     */
665/*                                                                           */
666/*  Processing    : Point to the (0,0) and (1,0) position in the ref frame   */
667/*                  Interpolate these two values to get the value at(0.5,0)  */
668/*                  Repeat this to get an 8 x 8 block using 9 x 8 block from */
669/*                  reference frame                                          */
670/*                                                                           */
671/*  Outputs       : out -  Output containing the extracted block             */
672/*                                                                           */
673/*  Returns       : None                                                     */
674/*                                                                           */
675/*  Issues        : None                                                     */
676/*                                                                           */
677/*****************************************************************************/
678void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
679                            UWORD8 *ref,
680                            UWORD32 ref_wid,
681                            UWORD32 out_wid)
682{
683    UWORD8 *ref_p0,*ref_p1;
684    __m128i src_r0, src_r0_1, src_r1, src_r1_1;
685    /* P0-P3 are the pixels in the reference frame and Q is the value being */
686    /* estimated                                                            */
687    /*
688       P0 Q P1
689    */
690
691    ref_p0 = ref;
692    ref_p1 = ref + 1;
693
694    // Row 0 and 1
695    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
696    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
697    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 1
698    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
699
700    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
701    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
702
703    _mm_storel_epi64((__m128i *)out, src_r0);
704    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
705
706    // Row 2 and 3
707    ref_p0 += 2*ref_wid;
708    ref_p1 += 2*ref_wid;
709    out += 2*out_wid;
710
711    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 2
712    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
713    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 3
714    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
715
716    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
717    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
718
719    _mm_storel_epi64((__m128i *)out, src_r0);
720    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
721
722    // Row 4 and 5
723    ref_p0 += 2*ref_wid;
724    ref_p1 += 2*ref_wid;
725    out += 2*out_wid;
726
727    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 4
728    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
729    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 5
730    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
731
732    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
733    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
734
735    _mm_storel_epi64((__m128i *)out, src_r0);
736    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
737
738    // Row 6 and 7
739    ref_p0 += 2*ref_wid;
740    ref_p1 += 2*ref_wid;
741    out += 2*out_wid;
742
743    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 6
744    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
745    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 7
746    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
747
748    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
749    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
750
751    _mm_storel_epi64((__m128i *)out, src_r0);
752    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
753
754    return;
755}
756
757
758/*****************************************************************************/
759/*                                                                           */
760/*  Function Name : impeg2_mc_fullx_halfy_8x8_sse42()                                 */
761/*                                                                           */
762/*  Description   : Gets the buffer from (0,0.5) to (8,8.5)                  */
763/*                  and the above block of size 8 x 8 will be placed as a    */
764/*                  block from the current position of out_buf               */
765/*                                                                           */
/*  Inputs        : ref - Reference frame from which the block will be       */
/*                        extracted.                                         */
/*                  ref_wid - Width of reference frame                       */
/*                  out_wid - Width of the output frame                      */
/*                  Block width  - fixed at 8 (not a parameter)              */
/*                  Block height - fixed at 8 (not a parameter)              */
772/*                                                                           */
773/*  Globals       : None                                                     */
774/*                                                                           */
775/*  Processing    : Point to the (0,0) and (0,1)   position in the ref frame */
776/*                  Interpolate these two values to get the value at(0,0.5)  */
777/*                  Repeat this to get an 8 x 8 block using 8 x 9 block from */
778/*                  reference frame                                          */
779/*                                                                           */
780/*  Outputs       : out -  Output containing the extracted block             */
781/*                                                                           */
782/*  Returns       : None                                                     */
783/*                                                                           */
784/*  Issues        : None                                                     */
785/*                                                                           */
786/*****************************************************************************/
void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
                            UWORD8 *ref,
                            UWORD32 ref_wid,
                            UWORD32 out_wid)
{
    /* Vertical half-sample prediction: output row n is the byte-wise       */
    /* rounded average of reference rows n and n + 1, so nine reference     */
    /* rows of 8 pixels are read to produce eight output rows.              */
    /*
       upper
         x      <- value written to out
       lower
    */
    __m128i upper, middle, lower;
    UWORD32 pairs_left = 4;

    /* Reference row 0 seeds the sliding window */
    upper = _mm_loadl_epi64((__m128i *)ref);

    for(;;)
    {
        /* Fetch the next two reference rows (8 pixels each) */
        middle = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
        lower = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));

        /* Emit two output rows from the three rows held in registers */
        _mm_storel_epi64((__m128i *)out, _mm_avg_epu8(upper, middle));
        _mm_storel_epi64((__m128i *)(out + out_wid),
                         _mm_avg_epu8(middle, lower));

        pairs_left--;
        if(0 == pairs_left)
        {
            /* All eight output rows are written; do not step the pointers */
            /* past the last rows that are actually accessed               */
            break;
        }

        /* The bottom row of this pair is the top row of the next one */
        upper = lower;
        ref += 2 * ref_wid;
        out += 2 * out_wid;
    }
}
840
841/*****************************************************************************/
842/*                                                                           */
843/*  Function Name : impeg2_mc_fullx_fully_8x8_sse42()                                 */
844/*                                                                           */
845/*  Description   : Gets the buffer from (x,y) to (x+8,y+8)                  */
846/*                  and the above block of size 8 x 8 will be placed as a    */
847/*                  block from the current position of out_buf               */
848/*                                                                           */
/*  Inputs        : ref - Reference frame from which the block will be       */
/*                        extracted.                                         */
/*                  ref_wid - Width of reference frame                       */
/*                  out_wid - Width of the output frame                      */
/*                  Block width  - fixed at 8 (not a parameter)              */
/*                  Block height - fixed at 8 (not a parameter)              */
855/*                                                                           */
856/*  Globals       : None                                                     */
857/*                                                                           */
858/*  Processing    : Point to the (0,0) position in the ref frame             */
859/*                  Get an 8 x 8 block from reference frame                  */
860/*                                                                           */
861/*  Outputs       : out -  Output containing the extracted block             */
862/*                                                                           */
863/*  Returns       : None                                                     */
864/*                                                                           */
865/*  Issues        : None                                                     */
866/*                                                                           */
867/*****************************************************************************/
void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
                            UWORD8 *ref,
                            UWORD32 ref_wid,
                            UWORD32 out_wid)
{
    /* Full-sample prediction: a plain 8 x 8 copy from ref to out.         */
    /* The block is handled as two halves of four rows; within each half    */
    /* all four rows are read before any of them is written.                */
    __m128i rows[4];
    UWORD32 half, i;

    for(half = 0; half < 2; half++)
    {
        /* First row of this half: rows 0-3, then rows 4-7 */
        UWORD8 *src = ref + half * 4 * ref_wid;
        UWORD8 *dst = out + half * 4 * out_wid;

        /* Read four rows of 8 pixels */
        for(i = 0; i < 4; i++)
        {
            rows[i] = _mm_loadl_epi64((__m128i *)(src + i * ref_wid));
        }

        /* Write the same four rows using the output stride */
        for(i = 0; i < 4; i++)
        {
            _mm_storel_epi64((__m128i *)(dst + i * out_wid), rows[i]);
        }
    }
}
900