1aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/******************************************************************************
2aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
3aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * Copyright (C) 2015 The Android Open Source Project
4aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
5aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * Licensed under the Apache License, Version 2.0 (the "License");
6aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * you may not use this file except in compliance with the License.
7aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * You may obtain a copy of the License at:
8aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
9aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * http://www.apache.org/licenses/LICENSE-2.0
10aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
11aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * Unless required by applicable law or agreed to in writing, software
12aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * distributed under the License is distributed on an "AS IS" BASIS,
13aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * See the License for the specific language governing permissions and
15aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * limitations under the License.
16aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
17aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *****************************************************************************
18aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*/
20aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
21aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/**
22aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *******************************************************************************
23aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * @file
24aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *  impeg2_inter_pred_sse42_intr.c
25aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
26aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * @brief
27aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *  Contains Motion compensation function definitions for MPEG2 decoder
28aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
29aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * @author
30aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *  Mohit [100664]
31aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
32aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_copy_mb_sse42()
33aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_interpolate_sse42()
34aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_mc_halfx_halfy_8x8_sse42()
35aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_mc_halfx_fully_8x8_sse42()
36aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_mc_fullx_halfy_8x8_sse42()
37aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * - impeg2_mc_fullx_fully_8x8_sse42()
38aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
39aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani * @remarks
40aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *  None
41aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *
42aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani *******************************************************************************
43aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani */
44aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <stdio.h>
45aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <string.h>
46aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include "iv_datatypedef.h"
47aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include "impeg2_macros.h"
48aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include "impeg2_defs.h"
49aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include "impeg2_inter_pred.h"
50aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
51aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <immintrin.h>
52aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <emmintrin.h>
53aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <smmintrin.h>
54aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani#include <tmmintrin.h>
55aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
56aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*******************************************************************************
57aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  Function Name   : impeg2_copy_mb
58aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*
59aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  Description     : copies 3 components to the frame from mc_buf
60aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*
61aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  Arguments       :
62aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  src_buf         : Source Buffer
63aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  dst_buf         : Destination Buffer
64aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  src_wd          : Source Width
65aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  dst_wd          : destination Width
66aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*
67aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*  Values Returned : None
68aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani*******************************************************************************/
69aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
70aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                    yuv_buf_t *dst_buf,
71aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                    UWORD32 src_wd,
72aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                    UWORD32 dst_wd)
73aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
74aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *src;
75aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *dst;
76aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src_r0, src_r1, src_r2, src_r3;
77aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
78aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
79aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* copy Y                                              */
80aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
81aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src = src_buf->pu1_y;
82aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst = dst_buf->pu1_y;
83aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
84aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadu_si128((__m128i *) (src));
85aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
86aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
87aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
88aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
89aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src_r0);
90aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
91aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
92aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
93aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
94aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
95aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src += 4 * src_wd;
96aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * dst_wd;
97aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadu_si128((__m128i *) (src));
98aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
99aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
100aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
101aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
102aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src_r0);
103aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
104aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
105aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
106aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
107aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 8-11
108aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src += 4 * src_wd;
109aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * dst_wd;
110aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadu_si128((__m128i *) (src));
111aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
112aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
113aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
114aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
115aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src_r0);
116aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
117aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
118aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
119aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
120aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 12-15
121aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src += 4 * src_wd;
122aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * dst_wd;
123aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadu_si128((__m128i *) (src));
124aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
125aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
126aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
127aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
128aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src_r0);
129aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
130aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
131aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
132aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
133aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_wd >>= 1;
134aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst_wd >>= 1;
135aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
136aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
137aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* copy U                                              */
138aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
139aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src = src_buf->pu1_u;
140aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst = dst_buf->pu1_u;
141aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
142aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
143aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)src);
144aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
145aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
146aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
147aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
148aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)dst, src_r0);
149aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
150aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
151aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
152aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
153aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
154aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src += 4 * src_wd;
155aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * dst_wd;
156aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
157aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)src);
158aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
159aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
160aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
161aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
162aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)dst, src_r0);
163aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
164aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
165aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
166aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
167aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
168aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* copy V                                              */
169aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
170aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src = src_buf->pu1_v;
171aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst = dst_buf->pu1_v;
172aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
173aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)src);
174aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
175aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
176aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
177aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
178aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)dst, src_r0);
179aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
180aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
181aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
182aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
183aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
184aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src += 4 * src_wd;
185aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * dst_wd;
186aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
187aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)src);
188aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(src + src_wd));
189aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
190aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
191aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
192aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)dst, src_r0);
193aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
194aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
195aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
196aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
197aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
198aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
199aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
/*  Function Name : impeg2_interpolate_sse42                                 */
201aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
202aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Description   : averages the contents of buf_src1 and buf_src2 and stores*/
203aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  result in buf_dst                                        */
204aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
205aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Inputs        : buf_src1 -  First Source                                 */
206aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  buf_src2 -  Second Source                                */
207aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
208aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Globals       : None                                                     */
209aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
210aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Processing    : Avg the values from two sources and store the result in  */
211aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  destination buffer                                       */
212aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
213aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Outputs       : buf_dst  -  Avg of contents of buf_src1 and buf_src2     */
214aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
215aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Returns       : None                                                     */
216aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
/*  Issues        : Assumes that all 3 buffers are of same size; the luma    */
/*                  source rows are read at a fixed 16-byte pitch, only the  */
/*                  destination rows use stride                              */
218aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
219aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
220aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
221aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                        yuv_buf_t *buf_src2,
222aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                        yuv_buf_t *buf_dst,
223aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                        UWORD32 stride)
224aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
225aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *src1, *src2;
226aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *dst;
227aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src1_r0, src1_r1, src1_r2, src1_r3;
228aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src2_r0, src2_r1, src2_r2, src2_r3;
229aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
230aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
231aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* interpolate Y                                       */
232aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
233aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 = buf_src1->pu1_y;
234aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 = buf_src2->pu1_y;
235aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst  = buf_dst->pu1_y;
236aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
237aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
238aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
239aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
240aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
241aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
242aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
243aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
244aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
245aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
246aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
247aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
248aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
249aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
250aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
251aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
252aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src1_r0);
253aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
254aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
255aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
256aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
257aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
258aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 += 4 * 16;
259aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 += 4 * 16;
260aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * stride;
261aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
262aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
263aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
264aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
265aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
266aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
267aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
268aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
269aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
270aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
271aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
272aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
273aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
274aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
275aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
276aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src1_r0);
277aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
278aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
279aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
280aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
281aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 8-11
282aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 += 4 * 16;
283aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 += 4 * 16;
284aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * stride;
285aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
286aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
287aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
288aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
289aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
290aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
291aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
292aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
293aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
294aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
295aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
296aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
297aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
298aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
299aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
300aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src1_r0);
301aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
302aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
303aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
304aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
305aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 12-15
306aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 += 4 * 16;
307aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 += 4 * 16;
308aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * stride;
309aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadu_si128((__m128i *) (src1));
310aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
311aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
312aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
313aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
314aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadu_si128((__m128i *) (src2));
315aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
316aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
317aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
318aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
319aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
320aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
321aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
322aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
323aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
324aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) dst, src1_r0);
325aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
326aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
327aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
328aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
329aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    stride >>= 1;
330aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
331aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
332aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* interpolate U                                       */
333aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
334aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 = buf_src1->pu1_u;
335aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 = buf_src2->pu1_u;
336aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst  = buf_dst->pu1_u;
337aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
338aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
339aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
340aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
341aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
342aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
343aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
344aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
345aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
346aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
347aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
348aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
349aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
350aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
351aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
352aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
353aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) dst, src1_r0);
354aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
355aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
356aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
357aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
358aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
359aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 += 4 * 8;
360aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 += 4 * 8;
361aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * stride;
362aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
363aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
364aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
365aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
366aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
367aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
368aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
369aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
370aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
371aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
372aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
373aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
374aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
375aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
376aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
377aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
378aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) dst, src1_r0);
379aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
380aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
381aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
382aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
383aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
384aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* interpolate V                                       */
385aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*******************************************************/
386aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 = buf_src1->pu1_v;
387aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 = buf_src2->pu1_v;
388aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst  = buf_dst->pu1_v;
389aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
390aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
391aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
392aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
393aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
394aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
395aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
396aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
397aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
398aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
399aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
400aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
401aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
402aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
403aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
404aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
405aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
406aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) dst, src1_r0);
407aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
408aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
409aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
410aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
411aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
412aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1 += 4 * 8;
413aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2 += 4 * 8;
414aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    dst += 4 * stride;
415aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
416aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
417aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
418aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
419aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
420aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
421aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
422aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
423aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
424aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
425aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
426aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
427aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
428aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
429aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
430aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
431aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) dst, src1_r0);
432aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
433aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
434aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
435aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
436aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
437aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
438aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
439aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Function Name : impeg2_mc_halfx_halfy_8x8_sse42()                        */
440aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
441aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Description   : Gets the buffer from (0.5,0.5) to (8.5,8.5)              */
442aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  and the above block of size 8 x 8 will be placed as a    */
443aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  block from the current position of out_buf               */
444aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
445aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Inputs        : ref - Reference frame from which the block will be       */
446aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                        extracted.                                         */
447aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  ref_wid - Width of reference frame                       */
448aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  out_wid - Width of the output frame                      */
449aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  (block width and height are fixed at 8 x 8; they are     */
450aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  not passed as parameters)                                */
451aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
452aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Globals       : None                                                     */
453aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
454aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Processing    : Point to the (0,0), (1,0), (0,1) and (1,1) positions in  */
455aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  the ref frame. Interpolate these four values to get the  */
456aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  value at (0.5,0.5). Repeat this to get an 8 x 8 block    */
457aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  using a 9 x 9 block from the reference frame.            */
458aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
459aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Outputs       : out -  Output containing the extracted block             */
460aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
461aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Returns       : None                                                     */
462aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
463aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Issues        : None                                                     */
464aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
465aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
466aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
467aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD8 *ref,
468aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 ref_wid,
469aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 out_wid)
470aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
471aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3;
472aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* P0-P3 are the pixels in the reference frame and Q is the value being */
473aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* estimated                                                            */
474aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*
475aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani       P0 P1
476aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani         Q
477aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani       P2 P3
478aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    */
479aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src_r0, src_r0_1, src_r1, src_r1_1;
480aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i tmp0, tmp1;
481aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i value_2 = _mm_set1_epi16(2);
482aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
483aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p0 = ref;
484aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p1 = ref + 1;
485aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 = ref + ref_wid;
486aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 = ref + ref_wid + 1;
487aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
488aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
489aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
490aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 1
491aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
492aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
493aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
494aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
495aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_cvtepu8_epi16(src_r1);
496aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 =  _mm_cvtepu8_epi16(src_r1_1);
497aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
498aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 0 horizontal interpolation
499aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(src_r1, src_r1_1);             //Row 1 horizontal interpolation
500aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 0 vertical interpolation
501aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, value_2);
502aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 =  _mm_srli_epi16(tmp0, 2);
503aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_packus_epi16(tmp0, value_2);
504aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
505aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp0);
506aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
507aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 1
508aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
509aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
510aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
511aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
512aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 2
513aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
514aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
515aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
516aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
517aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
518aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 2 horizontal interpolation
519aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 1 vertical interpolation
520aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp1, value_2);
521aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 =  _mm_srli_epi16(tmp1, 2);
522aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_packus_epi16(tmp1, value_2);
523aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
524aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp1);
525aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
526aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 2
527aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
528aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
529aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
530aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
531aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 3
532aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
533aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
534aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
535aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
536aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
537aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(src_r0, src_r0_1);         //Row 3 horizontal interpolation
538aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
539aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, tmp1);               //Row 2 vertical interpolation
540aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, value_2);
541aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 =  _mm_srli_epi16(tmp0, 2);
542aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_packus_epi16(tmp0, value_2);
543aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
544aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp0);
545aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
546aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 3
547aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
548aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
549aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
550aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
551aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 4
552aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
553aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
554aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
555aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
556aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
557aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(src_r0, src_r0_1);         //Row 4 horizontal interpolation
558aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
559aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp0, tmp1);               //Row 3 vertical interpolation
560aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp1, value_2);
561aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 =  _mm_srli_epi16(tmp1, 2);
562aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_packus_epi16(tmp1, value_2);
563aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
564aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp1);
565aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
566aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 4
567aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
568aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
569aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
570aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
571aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 5
572aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
573aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
574aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
575aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
576aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
577aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(src_r0, src_r0_1);     //Row 5 horizontal interpolation
578aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
579aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, tmp1);           //Row 4 vertical interpolation
580aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, value_2);
581aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 =  _mm_srli_epi16(tmp0, 2);
582aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_packus_epi16(tmp0, value_2);
583aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
584aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp0);
585aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
586aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 5
587aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
588aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
589aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
590aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
591aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 6
592aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
593aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
594aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
595aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
596aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
597aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 6 horizontal interpolation
598aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
599aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 5 vertical interpolation
600aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp1, value_2);
601aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 =  _mm_srli_epi16(tmp1, 2);
602aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_packus_epi16(tmp1, value_2);
603aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
604aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp1);
605aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
606aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 6
607aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
608aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
609aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
610aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
611aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 7
612aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
613aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
614aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
615aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
616aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
617aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(src_r0, src_r0_1);             //Row 7 horizontal interpolation
618aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
619aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, tmp1);                   //Row 6 vertical interpolation
620aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(tmp0, value_2);
621aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 =  _mm_srli_epi16(tmp0, 2);
622aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_packus_epi16(tmp0, value_2);
623aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
624aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp0);
625aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
626aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    //Row 7
627aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p2 += ref_wid;
628aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p3 += ref_wid;
629aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += out_wid;
630aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
631aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2));     //Row 8
632aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
633aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
634aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_cvtepu8_epi16(src_r0);
635aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 =  _mm_cvtepu8_epi16(src_r0_1);
636aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
637aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp0 = _mm_add_epi16(src_r0, src_r0_1);             //Row 8 horizontal interpolation
638aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
639aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp0, tmp1);                   //Row 7 vertical interpolation
640aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_add_epi16(tmp1, value_2);
641aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 =  _mm_srli_epi16(tmp1, 2);
642aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    tmp1 = _mm_packus_epi16(tmp1, value_2);
643aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
644aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, tmp1);
645aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
646aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    return;
647aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
648aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
649aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
650aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
651aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Function Name : impeg2_mc_halfx_fully_8x8_sse42()                                 */
652aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
653aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Description   : Gets the buffer from (0.5,0) to (8.5,8)                  */
654aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  and the above block of size 8 x 8 will be placed as a    */
655aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  block from the current position of out_buf               */
656aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
657aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Inputs        : ref - Reference frame from which the block will be       */
658aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                        block will be extracted.                           */
659aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  ref_wid - WIdth of reference frame                       */
660aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  out_wid - WIdth of the output frame                      */
661aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - width of the block                          */
662aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - height of the block                         */
663aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
664aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Globals       : None                                                     */
665aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
666aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Processing    : Point to the (0,0) and (1,0) position in the ref frame   */
667aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  Interpolate these two values to get the value at(0.5,0)  */
668aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  Repeat this to get an 8 x 8 block using 9 x 8 block from */
669aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  reference frame                                          */
670aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
671aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Outputs       : out -  Output containing the extracted block             */
672aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
673aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Returns       : None                                                     */
674aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
675aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Issues        : None                                                     */
676aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
677aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
678aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
679aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD8 *ref,
680aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 ref_wid,
681aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 out_wid)
682aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
683aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    UWORD8 *ref_p0,*ref_p1;
684aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src_r0, src_r0_1, src_r1, src_r1_1;
685aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* P0-P3 are the pixels in the reference frame and Q is the value being */
686aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* estimated                                                            */
687aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*
688aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani       P0 Q P1
689aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    */
690aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
691aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p0 = ref;
692aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p1 = ref + 1;
693aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
694aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0 and 1
695aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 0
696aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
697aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 1
698aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
699aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
700aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
701aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
702aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
703aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
704aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
705aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
706aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 2 and 3
707aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p0 += 2*ref_wid;
708aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p1 += 2*ref_wid;
709aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += 2*out_wid;
710aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
711aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 2
712aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
713aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 3
714aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
715aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
716aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
717aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
718aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
719aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
720aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
721aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
722aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4 and 5
723aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p0 += 2*ref_wid;
724aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p1 += 2*ref_wid;
725aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += 2*out_wid;
726aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
727aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 4
728aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
729aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 5
730aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
731aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
732aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
733aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
734aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
735aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
736aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
737aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
738aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 6 and 7
739aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p0 += 2*ref_wid;
740aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref_p1 += 2*ref_wid;
741aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += 2*out_wid;
742aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
743aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0));     //Row 6
744aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
745aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid));       //Row 7
746aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
747aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
748aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
749aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
750aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
751aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
752aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
753aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
754aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    return;
755aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
756aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
757aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
758aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
759aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
760aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Function Name : impeg2_mc_fullx_halfy_8x8_sse42()                                 */
761aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
762aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Description   : Gets the buffer from (0,0.5) to (8,8.5)                  */
763aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  and the above block of size 8 x 8 will be placed as a    */
764aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  block from the current position of out_buf               */
765aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
766aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Inputs        : ref - Reference frame from which the block will be       */
767aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                        block will be extracted.                           */
768aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  ref_wid - WIdth of reference frame                       */
769aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  out_wid - WIdth of the output frame                      */
770aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - width of the block                          */
771aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - height of the block                         */
772aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
773aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Globals       : None                                                     */
774aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
775aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Processing    : Point to the (0,0) and (0,1)   position in the ref frame */
776aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  Interpolate these two values to get the value at(0,0.5)  */
777aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  Repeat this to get an 8 x 8 block using 8 x 9 block from */
778aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  reference frame                                          */
779aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
780aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Outputs       : out -  Output containing the extracted block             */
781aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
782aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Returns       : None                                                     */
783aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
784aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Issues        : None                                                     */
785aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
786aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
787aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
788aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD8 *ref,
789aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 ref_wid,
790aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 out_wid)
791aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
792aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src_r0, src_r1, src_r2, temp0, temp1;
793aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* P0-P3 are the pixels in the reference frame and Q is the value being */
794aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /* estimated                                                            */
795aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    /*
796aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani       P0
797aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani        x
798aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani       P1
799aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    */
800aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *)ref);               //Row 0
801aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));   //Row 1
802aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));   //Row 2
803aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp0 = _mm_avg_epu8(src_r0, src_r1);
804aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp1 = _mm_avg_epu8(src_r1, src_r2);
805aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, temp0);                //Row 0
806aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), temp1);    //Row 1
807aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
808aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref+= 3*ref_wid;
809aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out+= 2*out_wid;
810aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
811aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *)ref);               //Row 3
812aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));   //Row 4
813aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp0 = _mm_avg_epu8(src_r2, src_r0);
814aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp1 = _mm_avg_epu8(src_r0, src_r1);
815aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, temp0);                //Row 2
816aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), temp1);    //Row 3
817aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
818aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref += 2*ref_wid;
819aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out+= 2*out_wid;
820aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
821aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadl_epi64((__m128i *)ref);               //Row 5
822aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));   //Row 6
823aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp0 = _mm_avg_epu8(src_r1, src_r2);
824aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp1 = _mm_avg_epu8(src_r2, src_r0);
825aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, temp0);                //Row 4
826aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), temp1);    //Row 5
827aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
828aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref += 2*ref_wid;
829aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out+= 2*out_wid;
830aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
831aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 = _mm_loadl_epi64((__m128i *)ref);               //Row 7
832aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid));  //Row 8
833aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp0 = _mm_avg_epu8(src_r0, src_r1);
834aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    temp1 = _mm_avg_epu8(src_r1, src_r2);
835aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, temp0);                //Row 6
836aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), temp1);    //Row 7
837aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
838aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    return;
839aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
840aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
841aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
842aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
843aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Function Name : impeg2_mc_fullx_fully_8x8_sse42()                                 */
844aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
845aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Description   : Gets the buffer from (x,y) to (x+8,y+8)                  */
846aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  and the above block of size 8 x 8 will be placed as a    */
847aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  block from the current position of out_buf               */
848aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
849aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Inputs        : ref - Reference frame from which the block will be       */
850aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                        block will be extracted.                           */
851aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  ref_wid - WIdth of reference frame                       */
852aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  out_wid - WIdth of the output frame                      */
853aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - width of the block                          */
854aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  blk_width  - height of the block                         */
855aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
856aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Globals       : None                                                     */
857aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
858aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Processing    : Point to the (0,0) position in the ref frame             */
859aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                  Get an 8 x 8 block from reference frame                  */
860aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
861aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Outputs       : out -  Output containing the extracted block             */
862aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
863aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Returns       : None                                                     */
864aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
865aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*  Issues        : None                                                     */
866aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*                                                                           */
867aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani/*****************************************************************************/
868aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhanivoid impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
869aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD8 *ref,
870aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 ref_wid,
871aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani                            UWORD32 out_wid)
872aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani{
873aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    __m128i src_r0, src_r1, src_r2, src_r3;
874aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 0-3
875aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)ref);
876aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(ref + ref_wid));
877aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
878aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));
879aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
880aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
881aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
882aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2);
883aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3);
884aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
885aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    // Row 4-7
886aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    ref += 4 * ref_wid;
887aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    out += 4 * out_wid;
888aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
889aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r0 =  _mm_loadl_epi64((__m128i *)ref);
890aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r1 =  _mm_loadl_epi64((__m128i *)(ref + ref_wid));
891aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r2 =  _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
892aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    src_r3 =  _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));
893aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani
894aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)out, src_r0);
895aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
896aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2);
897aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3);
898aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani    return;
899aed24eee7ddfc93f1436b0c1679431bd286879b4Venkatarama Avadhani}
900