1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20
21/**
22******************************************************************************
23* @file ime_distortion_metrics_sse42.c
24*
25* @brief
26*  This file contains definitions of routines that compute distortion
27*  between two macro/sub blocks of identical dimensions
28*
29* @author
30*  Ittiam
31*
32* @par List of Functions:
33*  - ime_compute_sad_16x16_sse42()
34*  - ime_compute_sad_16x16_fast_sse42()
35*  - ime_compute_sad_16x16_ea8_sse42()
36*  - ime_compute_sad_16x8_sse42()
37*  - ime_calculate_sad4_prog_sse42()
38*  - ime_sub_pel_compute_sad_16x16_sse42()
39*  - ime_compute_satqd_16x16_lumainter_sse42()
40*
41* @remarks
42*  None
43*
44*******************************************************************************
45*/
46
47/*****************************************************************************/
48/* File Includes                                                             */
49/*****************************************************************************/
50
51/* System include files */
52#include <stdio.h>
53#include <stdlib.h>
54#include <string.h>
55
56/* User include files */
57#include "ime_typedefs.h"
58#include "ime_defs.h"
59#include "ime_macros.h"
60#include "ime_statistics.h"
61#include "ime_platform_macros.h"
62#include "ime_distortion_metrics.h"
63#include <immintrin.h>
64
65/*****************************************************************************/
66/* Function Definitions                                                      */
67/*****************************************************************************/
68
69/**
70******************************************************************************
71*
72* @brief computes distortion (SAD) between 2 16x16 blocks
73*
74* @par   Description
75*   This functions computes SAD between 2 16x16 blocks. There is a provision
76*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
77*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
78*
79* @param[in] pu1_src
80*  UWORD8 pointer to the source
81*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (reference) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
90*
* @param[in] i4_max_sad
*  integer maximum allowed distortion (unused in this variant; the full
*  block SAD is always computed)
93*
94* @param[out] pi4_mb_distortion
95*  integer evaluated sad
96*
97* @remarks
98*
99******************************************************************************
100*/
101void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
102                           UWORD8 *pu1_est,
103                           WORD32 src_strd,
104                           WORD32 est_strd,
105                           WORD32 i4_max_sad,
106                           WORD32 *pi4_mb_distortion)
107{
108    __m128i src_r0, src_r1, src_r2, src_r3;
109    __m128i est_r0, est_r1, est_r2, est_r3;
110    __m128i res_r0, res_r1, res_r2, res_r3;
111    __m128i sad_val;
112    int val1, val2;
113    UNUSED (i4_max_sad);
114
115    // Row 0-3 sad calculation
116    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
117    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
118    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
119    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
120
121    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
122    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
123    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
124    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
125
126    res_r0 = _mm_sad_epu8(src_r0, est_r0);
127    res_r1 = _mm_sad_epu8(src_r1, est_r1);
128    res_r2 = _mm_sad_epu8(src_r2, est_r2);
129    res_r3 = _mm_sad_epu8(src_r3, est_r3);
130
131    sad_val = _mm_add_epi64(res_r0, res_r1);
132    sad_val = _mm_add_epi64(sad_val, res_r2);
133    sad_val = _mm_add_epi64(sad_val, res_r3);
134
135    // Row 4-7 sad calculation
136    pu1_src += 4*src_strd;
137    pu1_est += 4*est_strd;
138
139    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
140    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
141    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
142    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
143
144    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
145    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
146    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
147    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
148
149    res_r0 = _mm_sad_epu8(src_r0, est_r0);
150    res_r1 = _mm_sad_epu8(src_r1, est_r1);
151    res_r2 = _mm_sad_epu8(src_r2, est_r2);
152    res_r3 = _mm_sad_epu8(src_r3, est_r3);
153
154    sad_val = _mm_add_epi64(sad_val, res_r0);
155    sad_val = _mm_add_epi64(sad_val, res_r1);
156    sad_val = _mm_add_epi64(sad_val, res_r2);
157    sad_val = _mm_add_epi64(sad_val, res_r3);
158
159    // Row 8-11 sad calculation
160    pu1_src += 4*src_strd;
161    pu1_est += 4*est_strd;
162    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
163    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
164    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
165    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
166
167    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
168    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
169    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
170    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
171
172    res_r0 = _mm_sad_epu8(src_r0, est_r0);
173    res_r1 = _mm_sad_epu8(src_r1, est_r1);
174    res_r2 = _mm_sad_epu8(src_r2, est_r2);
175    res_r3 = _mm_sad_epu8(src_r3, est_r3);
176
177    sad_val = _mm_add_epi64(sad_val, res_r0);
178    sad_val = _mm_add_epi64(sad_val, res_r1);
179    sad_val = _mm_add_epi64(sad_val, res_r2);
180    sad_val = _mm_add_epi64(sad_val, res_r3);
181
182    // Row 12-15 sad calculation
183    pu1_src += 4*src_strd;
184    pu1_est += 4*est_strd;
185    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
186    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
187    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
188    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
189
190    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
191    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
192    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
193    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
194
195    res_r0 = _mm_sad_epu8(src_r0, est_r0);
196    res_r1 = _mm_sad_epu8(src_r1, est_r1);
197    res_r2 = _mm_sad_epu8(src_r2, est_r2);
198    res_r3 = _mm_sad_epu8(src_r3, est_r3);
199
200    sad_val = _mm_add_epi64(sad_val, res_r0);
201    sad_val = _mm_add_epi64(sad_val, res_r1);
202    sad_val = _mm_add_epi64(sad_val, res_r2);
203    sad_val = _mm_add_epi64(sad_val, res_r3);
204
205    val1 = _mm_extract_epi32(sad_val,0);
206    val2 = _mm_extract_epi32(sad_val, 2);
207    *pi4_mb_distortion = (val1+val2);
208
209    return;
210}
211
212/**
213******************************************************************************
214*
215*  @brief computes distortion (SAD) between 2 16x8  blocks
216*
217*
218*  @par   Description
219*   This functions computes SAD between 2 16x8 blocks. There is a provision
220*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
221*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
222*
223* @param[in] pu1_src
224*  UWORD8 pointer to the source
225*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (reference) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
234*
235* @param[in] u4_max_sad
236*  integer maximum allowed distortion
237*
238* @param[out] pi4_mb_distortion
239*  integer evaluated sad
240*
241* @remarks
242*
243******************************************************************************
244*/
245void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
246                    UWORD8 *pu1_est,
247                    WORD32 src_strd,
248                    WORD32 est_strd,
249                    WORD32 i4_max_sad,
250                    WORD32 *pi4_mb_distortion)
251{
252    __m128i src_r0, src_r1, src_r2, src_r3;
253    __m128i est_r0, est_r1, est_r2, est_r3;
254    __m128i res_r0, res_r1, res_r2, res_r3;
255    __m128i sad_val;
256    int val1, val2;
257    UNUSED (i4_max_sad);
258
259    // Row 0-3 sad calculation
260    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
261    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
262    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
263    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
264
265    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
266    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
267    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
268    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
269
270    res_r0 = _mm_sad_epu8(src_r0, est_r0);
271    res_r1 = _mm_sad_epu8(src_r1, est_r1);
272    res_r2 = _mm_sad_epu8(src_r2, est_r2);
273    res_r3 = _mm_sad_epu8(src_r3, est_r3);
274
275    sad_val = _mm_add_epi64(res_r0, res_r1);
276    sad_val = _mm_add_epi64(sad_val, res_r2);
277    sad_val = _mm_add_epi64(sad_val, res_r3);
278
279    // Row 4-7 sad calculation
280    pu1_src += 4*src_strd;
281    pu1_est += 4*est_strd;
282
283    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
284    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
285    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
286    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
287
288    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
289    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
290    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
291    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
292
293    res_r0 = _mm_sad_epu8(src_r0, est_r0);
294    res_r1 = _mm_sad_epu8(src_r1, est_r1);
295    res_r2 = _mm_sad_epu8(src_r2, est_r2);
296    res_r3 = _mm_sad_epu8(src_r3, est_r3);
297
298    sad_val = _mm_add_epi64(sad_val, res_r0);
299    sad_val = _mm_add_epi64(sad_val, res_r1);
300    sad_val = _mm_add_epi64(sad_val, res_r2);
301    sad_val = _mm_add_epi64(sad_val, res_r3);
302
303    val1 = _mm_extract_epi32(sad_val,0);
304    val2 = _mm_extract_epi32(sad_val, 2);
305    *pi4_mb_distortion = (val1+val2);
306    return;
307}
308
309/**
310******************************************************************************
311*
312* @brief computes distortion (SAD) between 2 16x16 blocks
313*
314* @par   Description
315*   This functions computes SAD between 2 16x16 blocks. There is a provision
316*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
317*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
318*
319* @param[in] pu1_src
320*  UWORD8 pointer to the source
321*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (reference) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
330*
331* @param[in] i4_max_sad
332*  integer maximum allowed distortion
333*
334* @param[out] pi4_mb_distortion
335*  integer evaluated sad
336*
337* @remarks
338*
339******************************************************************************
340*/
341void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
342                               UWORD8 *pu1_est,
343                               WORD32 src_strd,
344                               WORD32 est_strd,
345                               WORD32 i4_max_sad,
346                               WORD32 *pi4_mb_distortion)
347{
348    __m128i src_r0, src_r1, src_r2, src_r3;
349    __m128i est_r0, est_r1, est_r2, est_r3;
350    __m128i res_r0, res_r1, res_r2, res_r3;
351    __m128i sad_val;
352    WORD32 val1, val2;
353    WORD32 i4_sad;
354    UWORD8 *pu1_src_temp = pu1_src + src_strd;
355    UWORD8 *pu1_est_temp = pu1_est + est_strd;
356
357    // Row 0,2,4,6 sad calculation
358    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
359    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
360    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
361    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
362
363    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
364    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
365    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
366    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
367
368    res_r0 = _mm_sad_epu8(src_r0, est_r0);
369    res_r1 = _mm_sad_epu8(src_r1, est_r1);
370    res_r2 = _mm_sad_epu8(src_r2, est_r2);
371    res_r3 = _mm_sad_epu8(src_r3, est_r3);
372
373    sad_val = _mm_add_epi64(res_r0, res_r1);
374    sad_val = _mm_add_epi64(sad_val, res_r2);
375    sad_val = _mm_add_epi64(sad_val, res_r3);
376
377    // Row 8,10,12,14 sad calculation
378    pu1_src += 8*src_strd;
379    pu1_est += 8*est_strd;
380
381    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
382    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
383    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
384    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
385
386    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
387    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
388    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
389    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
390
391    res_r0 = _mm_sad_epu8(src_r0, est_r0);
392    res_r1 = _mm_sad_epu8(src_r1, est_r1);
393    res_r2 = _mm_sad_epu8(src_r2, est_r2);
394    res_r3 = _mm_sad_epu8(src_r3, est_r3);
395
396    sad_val = _mm_add_epi64(sad_val, res_r0);
397    sad_val = _mm_add_epi64(sad_val, res_r1);
398    sad_val = _mm_add_epi64(sad_val, res_r2);
399    sad_val = _mm_add_epi64(sad_val, res_r3);
400
401    pu1_src = pu1_src_temp;
402    pu1_est = pu1_est_temp;
403
404    val1 = _mm_extract_epi32(sad_val, 0);
405    val2 = _mm_extract_epi32(sad_val, 2);
406
407    i4_sad = val1 + val2;
408    if (i4_max_sad < i4_sad)
409    {
410        *pi4_mb_distortion = i4_sad;
411        return ;
412    }
413    // Row 1,3,5,7 sad calculation
414    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
415    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
416    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
417    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
418
419    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
420    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
421    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
422    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
423
424    res_r0 = _mm_sad_epu8(src_r0, est_r0);
425    res_r1 = _mm_sad_epu8(src_r1, est_r1);
426    res_r2 = _mm_sad_epu8(src_r2, est_r2);
427    res_r3 = _mm_sad_epu8(src_r3, est_r3);
428
429    sad_val = _mm_add_epi64(sad_val, res_r0);
430    sad_val = _mm_add_epi64(sad_val, res_r1);
431    sad_val = _mm_add_epi64(sad_val, res_r2);
432    sad_val = _mm_add_epi64(sad_val, res_r3);
433
434    // Row 9,11,13,15 sad calculation
435    pu1_src += 8*src_strd;
436    pu1_est += 8*est_strd;
437    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
438    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
439    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
440    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
441
442    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
443    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
444    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
445    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
446
447    res_r0 = _mm_sad_epu8(src_r0, est_r0);
448    res_r1 = _mm_sad_epu8(src_r1, est_r1);
449    res_r2 = _mm_sad_epu8(src_r2, est_r2);
450    res_r3 = _mm_sad_epu8(src_r3, est_r3);
451
452    sad_val = _mm_add_epi64(sad_val, res_r0);
453    sad_val = _mm_add_epi64(sad_val, res_r1);
454    sad_val = _mm_add_epi64(sad_val, res_r2);
455    sad_val = _mm_add_epi64(sad_val, res_r3);
456
457    val1 = _mm_extract_epi32(sad_val, 0);
458    val2 = _mm_extract_epi32(sad_val, 2);
459    *pi4_mb_distortion = (val1+val2);
460
461    return;
462}
463
464/**
465******************************************************************************
466*
467* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
468*
469* @par   Description
470*   This functions computes SAD between 2 16x16 blocks by processing alternate
471*   rows (fast mode). For fast mode it is assumed sad obtained by processing
472*   alternate rows is approximately twice as that for the whole block.
473*
474* @param[in] pu1_src
475*  UWORD8 pointer to the source
476*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (reference) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
485*
* @param[in] i4_max_sad
*  integer maximum allowed distortion (unused in this fast variant)
488*
489* @param[out] pi4_mb_distortion
490*  integer evaluated sad
491*
492* @remarks
493*
494******************************************************************************
495*/
496void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
497                                UWORD8 *pu1_est,
498                                WORD32 src_strd,
499                                WORD32 est_strd,
500                                WORD32 i4_max_sad,
501                                WORD32 *pi4_mb_distortion)
502{
503    __m128i src_r0, src_r1, src_r2, src_r3;
504    __m128i est_r0, est_r1, est_r2, est_r3;
505    __m128i res_r0, res_r1, res_r2, res_r3;
506    __m128i sad_val;
507    WORD32 val1, val2;
508    WORD32 i4_sad;
509    UWORD8 *pu1_src_temp = pu1_src + src_strd;
510    UWORD8 *pu1_est_temp = pu1_est + est_strd;
511    UNUSED (i4_max_sad);
512
513    // Row 0,2,4,6 sad calculation
514    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
515    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
516    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
517    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
518
519    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
520    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
521    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
522    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
523
524    res_r0 = _mm_sad_epu8(src_r0, est_r0);
525    res_r1 = _mm_sad_epu8(src_r1, est_r1);
526    res_r2 = _mm_sad_epu8(src_r2, est_r2);
527    res_r3 = _mm_sad_epu8(src_r3, est_r3);
528
529    sad_val = _mm_add_epi64(res_r0, res_r1);
530    sad_val = _mm_add_epi64(sad_val, res_r2);
531    sad_val = _mm_add_epi64(sad_val, res_r3);
532
533    // Row 8,10,12,14 sad calculation
534    pu1_src += 8 * src_strd;
535    pu1_est += 8 * est_strd;
536
537    src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
538    src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
539    src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
540    src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
541
542    est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
543    est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
544    est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
545    est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
546
547    res_r0 = _mm_sad_epu8(src_r0, est_r0);
548    res_r1 = _mm_sad_epu8(src_r1, est_r1);
549    res_r2 = _mm_sad_epu8(src_r2, est_r2);
550    res_r3 = _mm_sad_epu8(src_r3, est_r3);
551
552    sad_val = _mm_add_epi64(sad_val, res_r0);
553    sad_val = _mm_add_epi64(sad_val, res_r1);
554    sad_val = _mm_add_epi64(sad_val, res_r2);
555    sad_val = _mm_add_epi64(sad_val, res_r3);
556
557    pu1_src = pu1_src_temp;
558    pu1_est = pu1_est_temp;
559
560    val1 = _mm_extract_epi32(sad_val, 0);
561    val2 = _mm_extract_epi32(sad_val, 2);
562
563    i4_sad = val1 + val2;
564    *pi4_mb_distortion = (i4_sad<<1);
565    return;
566}
567
568/**
569*******************************************************************************
570*
571* @brief compute sad
572*
573* @par Description: This function computes the sad at vertices of diamond grid
574* centered at reference pointer and at unit distance from it.
575*
576* @param[in] pu1_ref
577*  UWORD8 pointer to the reference
578*
579* @param[out] pu1_src
580*  UWORD8 pointer to the source
581*
582* @param[in] ref_strd
583*  integer reference stride
584*
585* @param[in] src_strd
586*  integer source stride
587*
588* @param[out] pi4_sad
589*  pointer to integer array evaluated sad
590*
591* @returns  sad at all evaluated vertexes
592*
593* @remarks  none
594*
595*******************************************************************************
596*/
597void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
598                             UWORD8 *pu1_src,
599                             WORD32 ref_strd,
600                             WORD32 src_strd,
601                             WORD32 *pi4_sad)
602{
603    /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
604    UWORD8 *left_ptr    = pu1_ref - 1;
605    UWORD8 *right_ptr   = pu1_ref + 1;
606    UWORD8 *top_ptr     = pu1_ref - ref_strd;
607    UWORD8 *bot_ptr     = pu1_ref + ref_strd;
608
609    WORD32 val1, val2;
610    __m128i src, ref_left, ref_right, ref_top, ref_bot;
611    __m128i res_r0, res_r1, res_r2, res_r3;
612    __m128i sad_r0, sad_r1, sad_r2, sad_r3;
613
614    // Row 0 sad calculation
615    src = _mm_loadu_si128((__m128i *) (pu1_src));
616    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
617    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
618    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
619    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
620
621    sad_r0 = _mm_sad_epu8(src, ref_left);
622    sad_r1 = _mm_sad_epu8(src, ref_right);
623    sad_r2 = _mm_sad_epu8(src, ref_top);
624    sad_r3 = _mm_sad_epu8(src, ref_bot);
625
626    pu1_src += src_strd;
627    left_ptr += ref_strd;
628    right_ptr += ref_strd;
629    top_ptr += ref_strd;
630    bot_ptr += ref_strd;
631
632    // Row 1 sad calculation
633    src = _mm_loadu_si128((__m128i *) (pu1_src));
634    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
635    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
636    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
637    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
638
639    res_r0 = _mm_sad_epu8(src, ref_left);
640    res_r1 = _mm_sad_epu8(src, ref_right);
641    res_r2 = _mm_sad_epu8(src, ref_top);
642    res_r3 = _mm_sad_epu8(src, ref_bot);
643
644    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
645    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
646    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
647    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
648
649    pu1_src += src_strd;
650    left_ptr += ref_strd;
651    right_ptr += ref_strd;
652    top_ptr += ref_strd;
653    bot_ptr += ref_strd;
654
655    // Row 2 sad calculation
656    src = _mm_loadu_si128((__m128i *) (pu1_src));
657    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
658    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
659    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
660    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
661
662    res_r0 = _mm_sad_epu8(src, ref_left);
663    res_r1 = _mm_sad_epu8(src, ref_right);
664    res_r2 = _mm_sad_epu8(src, ref_top);
665    res_r3 = _mm_sad_epu8(src, ref_bot);
666
667    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
668    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
669    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
670    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
671
672    pu1_src += src_strd;
673    left_ptr += ref_strd;
674    right_ptr += ref_strd;
675    top_ptr += ref_strd;
676    bot_ptr += ref_strd;
677
678    // Row 3 sad calculation
679    src = _mm_loadu_si128((__m128i *) (pu1_src));
680    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
681    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
682    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
683    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
684
685    res_r0 = _mm_sad_epu8(src, ref_left);
686    res_r1 = _mm_sad_epu8(src, ref_right);
687    res_r2 = _mm_sad_epu8(src, ref_top);
688    res_r3 = _mm_sad_epu8(src, ref_bot);
689
690    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
691    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
692    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
693    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
694
695    pu1_src += src_strd;
696    left_ptr += ref_strd;
697    right_ptr += ref_strd;
698    top_ptr += ref_strd;
699    bot_ptr += ref_strd;
700
701    // Row 4 sad calculation
702    src = _mm_loadu_si128((__m128i *) (pu1_src));
703    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
704    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
705    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
706    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
707
708    res_r0 = _mm_sad_epu8(src, ref_left);
709    res_r1 = _mm_sad_epu8(src, ref_right);
710    res_r2 = _mm_sad_epu8(src, ref_top);
711    res_r3 = _mm_sad_epu8(src, ref_bot);
712
713    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
714    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
715    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
716    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
717
718    pu1_src += src_strd;
719    left_ptr += ref_strd;
720    right_ptr += ref_strd;
721    top_ptr += ref_strd;
722    bot_ptr += ref_strd;
723
724    // Row 5 sad calculation
725    src = _mm_loadu_si128((__m128i *) (pu1_src));
726    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
727    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
728    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
729    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
730
731    res_r0 = _mm_sad_epu8(src, ref_left);
732    res_r1 = _mm_sad_epu8(src, ref_right);
733    res_r2 = _mm_sad_epu8(src, ref_top);
734    res_r3 = _mm_sad_epu8(src, ref_bot);
735
736    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
737    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
738    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
739    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
740
741    pu1_src += src_strd;
742    left_ptr += ref_strd;
743    right_ptr += ref_strd;
744    top_ptr += ref_strd;
745    bot_ptr += ref_strd;
746
747    // Row 6 sad calculation
748    src = _mm_loadu_si128((__m128i *) (pu1_src));
749    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
750    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
751    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
752    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
753
754    res_r0 = _mm_sad_epu8(src, ref_left);
755    res_r1 = _mm_sad_epu8(src, ref_right);
756    res_r2 = _mm_sad_epu8(src, ref_top);
757    res_r3 = _mm_sad_epu8(src, ref_bot);
758
759    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
760    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
761    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
762    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
763
764    pu1_src += src_strd;
765    left_ptr += ref_strd;
766    right_ptr += ref_strd;
767    top_ptr += ref_strd;
768    bot_ptr += ref_strd;
769
770    // Row 7 sad calculation
771    src = _mm_loadu_si128((__m128i *) (pu1_src));
772    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
773    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
774    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
775    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
776
777    res_r0 = _mm_sad_epu8(src, ref_left);
778    res_r1 = _mm_sad_epu8(src, ref_right);
779    res_r2 = _mm_sad_epu8(src, ref_top);
780    res_r3 = _mm_sad_epu8(src, ref_bot);
781
782    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
783    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
784    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
785    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
786
787    pu1_src += src_strd;
788    left_ptr += ref_strd;
789    right_ptr += ref_strd;
790    top_ptr += ref_strd;
791    bot_ptr += ref_strd;
792
793    // Row 8 sad calculation
794    src = _mm_loadu_si128((__m128i *) (pu1_src));
795    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
796    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
797    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
798    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
799
800    res_r0 = _mm_sad_epu8(src, ref_left);
801    res_r1 = _mm_sad_epu8(src, ref_right);
802    res_r2 = _mm_sad_epu8(src, ref_top);
803    res_r3 = _mm_sad_epu8(src, ref_bot);
804
805    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
806    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
807    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
808    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
809
810    pu1_src += src_strd;
811    left_ptr += ref_strd;
812    right_ptr += ref_strd;
813    top_ptr += ref_strd;
814    bot_ptr += ref_strd;
815
816    // Row 9 sad calculation
817    src = _mm_loadu_si128((__m128i *) (pu1_src));
818    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
819    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
820    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
821    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
822
823    res_r0 = _mm_sad_epu8(src, ref_left);
824    res_r1 = _mm_sad_epu8(src, ref_right);
825    res_r2 = _mm_sad_epu8(src, ref_top);
826    res_r3 = _mm_sad_epu8(src, ref_bot);
827
828    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
829    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
830    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
831    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
832
833    pu1_src += src_strd;
834    left_ptr += ref_strd;
835    right_ptr += ref_strd;
836    top_ptr += ref_strd;
837    bot_ptr += ref_strd;
838
839    // Row 10 sad calculation
840    src = _mm_loadu_si128((__m128i *) (pu1_src));
841    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
842    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
843    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
844    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
845
846    res_r0 = _mm_sad_epu8(src, ref_left);
847    res_r1 = _mm_sad_epu8(src, ref_right);
848    res_r2 = _mm_sad_epu8(src, ref_top);
849    res_r3 = _mm_sad_epu8(src, ref_bot);
850
851    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
852    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
853    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
854    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
855
856    pu1_src += src_strd;
857    left_ptr += ref_strd;
858    right_ptr += ref_strd;
859    top_ptr += ref_strd;
860    bot_ptr += ref_strd;
861
862    // Row 11 sad calculation
863    src = _mm_loadu_si128((__m128i *) (pu1_src));
864    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
865    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
866    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
867    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
868
869    res_r0 = _mm_sad_epu8(src, ref_left);
870    res_r1 = _mm_sad_epu8(src, ref_right);
871    res_r2 = _mm_sad_epu8(src, ref_top);
872    res_r3 = _mm_sad_epu8(src, ref_bot);
873
874    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
875    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
876    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
877    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
878
879    pu1_src += src_strd;
880    left_ptr += ref_strd;
881    right_ptr += ref_strd;
882    top_ptr += ref_strd;
883    bot_ptr += ref_strd;
884
885    // Row 12 sad calculation
886    src = _mm_loadu_si128((__m128i *) (pu1_src));
887    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
888    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
889    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
890    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
891
892    res_r0 = _mm_sad_epu8(src, ref_left);
893    res_r1 = _mm_sad_epu8(src, ref_right);
894    res_r2 = _mm_sad_epu8(src, ref_top);
895    res_r3 = _mm_sad_epu8(src, ref_bot);
896
897    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
898    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
899    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
900    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
901
902    pu1_src += src_strd;
903    left_ptr += ref_strd;
904    right_ptr += ref_strd;
905    top_ptr += ref_strd;
906    bot_ptr += ref_strd;
907
908    // Row 13 sad calculation
909    src = _mm_loadu_si128((__m128i *) (pu1_src));
910    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
911    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
912    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
913    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
914
915    res_r0 = _mm_sad_epu8(src, ref_left);
916    res_r1 = _mm_sad_epu8(src, ref_right);
917    res_r2 = _mm_sad_epu8(src, ref_top);
918    res_r3 = _mm_sad_epu8(src, ref_bot);
919
920    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
921    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
922    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
923    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
924
925    pu1_src += src_strd;
926    left_ptr += ref_strd;
927    right_ptr += ref_strd;
928    top_ptr += ref_strd;
929    bot_ptr += ref_strd;
930
931    // Row 14 sad calculation
932    src = _mm_loadu_si128((__m128i *) (pu1_src));
933    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
934    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
935    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
936    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
937
938    res_r0 = _mm_sad_epu8(src, ref_left);
939    res_r1 = _mm_sad_epu8(src, ref_right);
940    res_r2 = _mm_sad_epu8(src, ref_top);
941    res_r3 = _mm_sad_epu8(src, ref_bot);
942
943    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
944    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
945    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
946    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
947
948    pu1_src += src_strd;
949    left_ptr += ref_strd;
950    right_ptr += ref_strd;
951    top_ptr += ref_strd;
952    bot_ptr += ref_strd;
953
954    // Row 15 sad calculation
955    src = _mm_loadu_si128((__m128i *) (pu1_src));
956    ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
957    ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
958    ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
959    ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
960
961    res_r0 = _mm_sad_epu8(src, ref_left);
962    res_r1 = _mm_sad_epu8(src, ref_right);
963    res_r2 = _mm_sad_epu8(src, ref_top);
964    res_r3 = _mm_sad_epu8(src, ref_bot);
965
966    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
967    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
968    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
969    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
970
971    val1 = _mm_extract_epi32(sad_r0, 0);
972    val2 = _mm_extract_epi32(sad_r0, 2);
973    pi4_sad[0] = (val1 + val2);
974
975    val1 = _mm_extract_epi32(sad_r1, 0);
976    val2 = _mm_extract_epi32(sad_r1, 2);
977    pi4_sad[1] = (val1 + val2);
978
979    val1 = _mm_extract_epi32(sad_r2, 0);
980    val2 = _mm_extract_epi32(sad_r2, 2);
981    pi4_sad[2] = (val1 + val2);
982
983    val1 = _mm_extract_epi32(sad_r3, 0);
984    val2 = _mm_extract_epi32(sad_r3, 2);
985    pi4_sad[3] = (val1 + val2);
986}
987
988/**
989******************************************************************************
990*
991* @brief computes distortion (SAD) at all subpel points about the src location
992*
993* @par Description
994*   This functions computes SAD at all points at a subpel distance from the
995*   current source location.
996*
997* @param[in] pu1_src
998*  UWORD8 pointer to the source
999*
1000* @param[in] pu1_ref_half_x
1001*  UWORD8 pointer to the half pel (x) interpolated reference buffer; read-only
1002*
1003* @param[in] pu1_ref_half_y
1004*  UWORD8 pointer to the half pel (y) interpolated reference buffer; read-only
1005*
1006* @param[in] pu1_ref_half_xy
1007*  UWORD8 pointer to the half pel (xy) interpolated reference buffer; read-only
1008*
1009* @param[in] src_strd
1010*  integer source stride
1011*
1012* @param[in] ref_strd
1013*  integer ref stride
1014*
1015* @param[out] pi4_sad
1016*  integer evaluated sad
1017*  pi4_sad[0] - half x
1018*  pi4_sad[1] - half x - 1
1019*  pi4_sad[2] - half y
1020*  pi4_sad[3] - half y - 1
1021*  pi4_sad[4] - half xy
1022*  pi4_sad[5] - half xy - 1
1023*  pi4_sad[6] - half xy - strd
1024*  pi4_sad[7] - half xy - 1 - strd
1025*
1026* @remarks
1027*
1028******************************************************************************
1029*/
1030void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
1031                                   UWORD8 *pu1_ref_half_x,
1032                                   UWORD8 *pu1_ref_half_y,
1033                                   UWORD8 *pu1_ref_half_xy,
1034                                   WORD32 src_strd,
1035                                   WORD32 ref_strd,
1036                                   WORD32 *pi4_sad)
1037{
1038    UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
1039    UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
1040    UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
1041    UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
1042    UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
1043    WORD32 val1, val2;
1044
1045    __m128i src, ref_half_x, ref_half_y, ref_half_xy;
1046    __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left;
1047    __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7;
1048    __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7;
1049    // Row 0 sad calculation
1050    src = _mm_loadu_si128((__m128i *) (pu1_src));
1051    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1052    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1053    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1054    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1055    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1056    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1057    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1058    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1059
1060    sad_r0 = _mm_sad_epu8(src, ref_half_x);
1061    sad_r1 = _mm_sad_epu8(src, ref_half_x_left);
1062    sad_r2 = _mm_sad_epu8(src, ref_half_y);
1063    sad_r3 = _mm_sad_epu8(src, ref_half_y_top);
1064    sad_r4 = _mm_sad_epu8(src, ref_half_xy);
1065    sad_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1066    sad_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1067    sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1068
1069    pu1_src += src_strd;
1070    pu1_ref_half_x += ref_strd;
1071    pu1_ref_half_x_left += ref_strd;
1072    pu1_ref_half_y += ref_strd;
1073    pu1_ref_half_y_top += ref_strd;
1074    pu1_ref_half_xy += ref_strd;
1075    pu1_ref_half_xy_left += ref_strd;
1076    pu1_ref_half_xy_top += ref_strd;
1077    pu1_ref_half_xy_top_left += ref_strd;
1078
1079    // Row 1 sad calculation
1080    src = _mm_loadu_si128((__m128i *) (pu1_src));
1081    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1082    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1083    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1084    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1085    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1086    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1087    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1088    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1089
1090    res_r0 = _mm_sad_epu8(src, ref_half_x);
1091    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1092    res_r2 = _mm_sad_epu8(src, ref_half_y);
1093    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1094    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1095    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1096    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1097    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1098
1099    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1100    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1101    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1102    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1103    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1104    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1105    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1106    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1107
1108    pu1_src += src_strd;
1109    pu1_ref_half_x += ref_strd;
1110    pu1_ref_half_x_left += ref_strd;
1111    pu1_ref_half_y += ref_strd;
1112    pu1_ref_half_y_top += ref_strd;
1113    pu1_ref_half_xy += ref_strd;
1114    pu1_ref_half_xy_left += ref_strd;
1115    pu1_ref_half_xy_top += ref_strd;
1116    pu1_ref_half_xy_top_left += ref_strd;
1117
1118    // Row 2 sad calculation
1119    src = _mm_loadu_si128((__m128i *) (pu1_src));
1120    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1121    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1122    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1123    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1124    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1125    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1126    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1127    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1128
1129    res_r0 = _mm_sad_epu8(src, ref_half_x);
1130    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1131    res_r2 = _mm_sad_epu8(src, ref_half_y);
1132    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1133    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1134    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1135    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1136    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1137
1138    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1139    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1140    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1141    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1142    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1143    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1144    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1145    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1146
1147    pu1_src += src_strd;
1148    pu1_ref_half_x += ref_strd;
1149    pu1_ref_half_x_left += ref_strd;
1150    pu1_ref_half_y += ref_strd;
1151    pu1_ref_half_y_top += ref_strd;
1152    pu1_ref_half_xy += ref_strd;
1153    pu1_ref_half_xy_left += ref_strd;
1154    pu1_ref_half_xy_top += ref_strd;
1155    pu1_ref_half_xy_top_left += ref_strd;
1156
1157    // Row 3 sad calculation
1158    src = _mm_loadu_si128((__m128i *) (pu1_src));
1159    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1160    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1161    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1162    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1163    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1164    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1165    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1166    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1167
1168    res_r0 = _mm_sad_epu8(src, ref_half_x);
1169    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1170    res_r2 = _mm_sad_epu8(src, ref_half_y);
1171    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1172    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1173    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1174    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1175    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1176
1177    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1178    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1179    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1180    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1181    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1182    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1183    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1184    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1185
1186    pu1_src += src_strd;
1187    pu1_ref_half_x += ref_strd;
1188    pu1_ref_half_x_left += ref_strd;
1189    pu1_ref_half_y += ref_strd;
1190    pu1_ref_half_y_top += ref_strd;
1191    pu1_ref_half_xy += ref_strd;
1192    pu1_ref_half_xy_left += ref_strd;
1193    pu1_ref_half_xy_top += ref_strd;
1194    pu1_ref_half_xy_top_left += ref_strd;
1195
1196    // Row 4 sad calculation
1197    src = _mm_loadu_si128((__m128i *) (pu1_src));
1198    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1199    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1200    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1201    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1202    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1203    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1204    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1205    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1206
1207    res_r0 = _mm_sad_epu8(src, ref_half_x);
1208    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1209    res_r2 = _mm_sad_epu8(src, ref_half_y);
1210    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1211    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1212    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1213    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1214    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1215
1216    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1217    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1218    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1219    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1220    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1221    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1222    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1223    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1224
1225    pu1_src += src_strd;
1226    pu1_ref_half_x += ref_strd;
1227    pu1_ref_half_x_left += ref_strd;
1228    pu1_ref_half_y += ref_strd;
1229    pu1_ref_half_y_top += ref_strd;
1230    pu1_ref_half_xy += ref_strd;
1231    pu1_ref_half_xy_left += ref_strd;
1232    pu1_ref_half_xy_top += ref_strd;
1233    pu1_ref_half_xy_top_left += ref_strd;
1234
1235
1236    // Row 5 sad calculation
1237    src = _mm_loadu_si128((__m128i *) (pu1_src));
1238    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1239    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1240    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1241    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1242    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1243    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1244    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1245    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1246
1247    res_r0 = _mm_sad_epu8(src, ref_half_x);
1248    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1249    res_r2 = _mm_sad_epu8(src, ref_half_y);
1250    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1251    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1252    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1253    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1254    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1255
1256    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1257    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1258    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1259    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1260    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1261    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1262    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1263    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1264
1265    pu1_src += src_strd;
1266    pu1_ref_half_x += ref_strd;
1267    pu1_ref_half_x_left += ref_strd;
1268    pu1_ref_half_y += ref_strd;
1269    pu1_ref_half_y_top += ref_strd;
1270    pu1_ref_half_xy += ref_strd;
1271    pu1_ref_half_xy_left += ref_strd;
1272    pu1_ref_half_xy_top += ref_strd;
1273    pu1_ref_half_xy_top_left += ref_strd;
1274
1275    // Row 6 sad calculation
1276    src = _mm_loadu_si128((__m128i *) (pu1_src));
1277    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1278    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1279    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1280    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1281    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1282    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1283    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1284    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1285
1286    res_r0 = _mm_sad_epu8(src, ref_half_x);
1287    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1288    res_r2 = _mm_sad_epu8(src, ref_half_y);
1289    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1290    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1291    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1292    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1293    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1294
1295    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1296    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1297    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1298    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1299    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1300    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1301    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1302    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1303
1304    pu1_src += src_strd;
1305    pu1_ref_half_x += ref_strd;
1306    pu1_ref_half_x_left += ref_strd;
1307    pu1_ref_half_y += ref_strd;
1308    pu1_ref_half_y_top += ref_strd;
1309    pu1_ref_half_xy += ref_strd;
1310    pu1_ref_half_xy_left += ref_strd;
1311    pu1_ref_half_xy_top += ref_strd;
1312    pu1_ref_half_xy_top_left += ref_strd;
1313
1314    // Row 7 sad calculation
1315    src = _mm_loadu_si128((__m128i *) (pu1_src));
1316    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1317    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1318    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1319    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1320    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1321    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1322    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1323    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1324
1325    res_r0 = _mm_sad_epu8(src, ref_half_x);
1326    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1327    res_r2 = _mm_sad_epu8(src, ref_half_y);
1328    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1329    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1330    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1331    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1332    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1333
1334    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1335    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1336    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1337    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1338    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1339    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1340    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1341    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1342
1343    pu1_src += src_strd;
1344    pu1_ref_half_x += ref_strd;
1345    pu1_ref_half_x_left += ref_strd;
1346    pu1_ref_half_y += ref_strd;
1347    pu1_ref_half_y_top += ref_strd;
1348    pu1_ref_half_xy += ref_strd;
1349    pu1_ref_half_xy_left += ref_strd;
1350    pu1_ref_half_xy_top += ref_strd;
1351    pu1_ref_half_xy_top_left += ref_strd;
1352
1353    // Row 8 sad calculation
1354    src = _mm_loadu_si128((__m128i *) (pu1_src));
1355    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1356    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1357    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1358    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1359    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1360    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1361    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1362    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1363
1364    res_r0 = _mm_sad_epu8(src, ref_half_x);
1365    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1366    res_r2 = _mm_sad_epu8(src, ref_half_y);
1367    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1368    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1369    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1370    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1371    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1372
1373    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1374    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1375    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1376    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1377    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1378    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1379    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1380    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1381
1382    pu1_src += src_strd;
1383    pu1_ref_half_x += ref_strd;
1384    pu1_ref_half_x_left += ref_strd;
1385    pu1_ref_half_y += ref_strd;
1386    pu1_ref_half_y_top += ref_strd;
1387    pu1_ref_half_xy += ref_strd;
1388    pu1_ref_half_xy_left += ref_strd;
1389    pu1_ref_half_xy_top += ref_strd;
1390    pu1_ref_half_xy_top_left += ref_strd;
1391
1392    // Row 9 sad calculation
1393    src = _mm_loadu_si128((__m128i *) (pu1_src));
1394    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1395    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1396    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1397    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1398    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1399    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1400    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1401    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1402
1403    res_r0 = _mm_sad_epu8(src, ref_half_x);
1404    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1405    res_r2 = _mm_sad_epu8(src, ref_half_y);
1406    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1407    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1408    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1409    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1410    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1411
1412    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1413    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1414    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1415    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1416    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1417    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1418    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1419    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1420
1421    pu1_src += src_strd;
1422    pu1_ref_half_x += ref_strd;
1423    pu1_ref_half_x_left += ref_strd;
1424    pu1_ref_half_y += ref_strd;
1425    pu1_ref_half_y_top += ref_strd;
1426    pu1_ref_half_xy += ref_strd;
1427    pu1_ref_half_xy_left += ref_strd;
1428    pu1_ref_half_xy_top += ref_strd;
1429    pu1_ref_half_xy_top_left += ref_strd;
1430
1431    // Row 10 sad calculation
1432    src = _mm_loadu_si128((__m128i *) (pu1_src));
1433    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1434    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1435    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1436    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1437    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1438    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1439    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1440    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1441
1442    res_r0 = _mm_sad_epu8(src, ref_half_x);
1443    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1444    res_r2 = _mm_sad_epu8(src, ref_half_y);
1445    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1446    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1447    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1448    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1449    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1450
1451    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1452    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1453    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1454    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1455    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1456    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1457    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1458    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1459
1460    pu1_src += src_strd;
1461    pu1_ref_half_x += ref_strd;
1462    pu1_ref_half_x_left += ref_strd;
1463    pu1_ref_half_y += ref_strd;
1464    pu1_ref_half_y_top += ref_strd;
1465    pu1_ref_half_xy += ref_strd;
1466    pu1_ref_half_xy_left += ref_strd;
1467    pu1_ref_half_xy_top += ref_strd;
1468    pu1_ref_half_xy_top_left += ref_strd;
1469
1470    // Row 11 sad calculation
1471    src = _mm_loadu_si128((__m128i *) (pu1_src));
1472    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1473    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1474    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1475    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1476    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1477    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1478    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1479    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1480
1481    res_r0 = _mm_sad_epu8(src, ref_half_x);
1482    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1483    res_r2 = _mm_sad_epu8(src, ref_half_y);
1484    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1485    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1486    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1487    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1488    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1489
1490    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1491    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1492    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1493    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1494    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1495    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1496    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1497    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1498
1499    pu1_src += src_strd;
1500    pu1_ref_half_x += ref_strd;
1501    pu1_ref_half_x_left += ref_strd;
1502    pu1_ref_half_y += ref_strd;
1503    pu1_ref_half_y_top += ref_strd;
1504    pu1_ref_half_xy += ref_strd;
1505    pu1_ref_half_xy_left += ref_strd;
1506    pu1_ref_half_xy_top += ref_strd;
1507    pu1_ref_half_xy_top_left += ref_strd;
1508
1509    // Row 12 sad calculation
1510    src = _mm_loadu_si128((__m128i *) (pu1_src));
1511    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1512    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1513    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1514    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1515    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1516    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1517    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1518    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1519
1520    res_r0 = _mm_sad_epu8(src, ref_half_x);
1521    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1522    res_r2 = _mm_sad_epu8(src, ref_half_y);
1523    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1524    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1525    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1526    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1527    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1528
1529    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1530    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1531    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1532    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1533    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1534    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1535    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1536    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1537
1538    pu1_src += src_strd;
1539    pu1_ref_half_x += ref_strd;
1540    pu1_ref_half_x_left += ref_strd;
1541    pu1_ref_half_y += ref_strd;
1542    pu1_ref_half_y_top += ref_strd;
1543    pu1_ref_half_xy += ref_strd;
1544    pu1_ref_half_xy_left += ref_strd;
1545    pu1_ref_half_xy_top += ref_strd;
1546    pu1_ref_half_xy_top_left += ref_strd;
1547
1548    // Row 13 sad calculation
1549    src = _mm_loadu_si128((__m128i *) (pu1_src));
1550    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1551    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1552    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1553    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1554    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1555    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1556    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1557    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1558
1559    res_r0 = _mm_sad_epu8(src, ref_half_x);
1560    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1561    res_r2 = _mm_sad_epu8(src, ref_half_y);
1562    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1563    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1564    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1565    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1566    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1567
1568    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1569    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1570    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1571    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1572    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1573    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1574    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1575    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1576
1577    pu1_src += src_strd;
1578    pu1_ref_half_x += ref_strd;
1579    pu1_ref_half_x_left += ref_strd;
1580    pu1_ref_half_y += ref_strd;
1581    pu1_ref_half_y_top += ref_strd;
1582    pu1_ref_half_xy += ref_strd;
1583    pu1_ref_half_xy_left += ref_strd;
1584    pu1_ref_half_xy_top += ref_strd;
1585    pu1_ref_half_xy_top_left += ref_strd;
1586
1587    // Row 14 sad calculation
1588    src = _mm_loadu_si128((__m128i *) (pu1_src));
1589    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1590    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1591    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1592    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1593    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1594    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1595    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1596    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1597
1598    res_r0 = _mm_sad_epu8(src, ref_half_x);
1599    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1600    res_r2 = _mm_sad_epu8(src, ref_half_y);
1601    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1602    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1603    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1604    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1605    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1606
1607    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1608    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1609    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1610    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1611    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1612    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1613    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1614    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1615
1616    pu1_src += src_strd;
1617    pu1_ref_half_x += ref_strd;
1618    pu1_ref_half_x_left += ref_strd;
1619    pu1_ref_half_y += ref_strd;
1620    pu1_ref_half_y_top += ref_strd;
1621    pu1_ref_half_xy += ref_strd;
1622    pu1_ref_half_xy_left += ref_strd;
1623    pu1_ref_half_xy_top += ref_strd;
1624    pu1_ref_half_xy_top_left += ref_strd;
1625
1626    // Row 15 sad calculation
1627    src = _mm_loadu_si128((__m128i *) (pu1_src));
1628    ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1629    ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1630    ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1631    ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1632    ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1633    ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1634    ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1635    ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1636
1637    res_r0 = _mm_sad_epu8(src, ref_half_x);
1638    res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1639    res_r2 = _mm_sad_epu8(src, ref_half_y);
1640    res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1641    res_r4 = _mm_sad_epu8(src, ref_half_xy);
1642    res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1643    res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1644    res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1645
1646    sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1647    sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1648    sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1649    sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1650    sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1651    sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1652    sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1653    sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1654
1655    val1 = _mm_extract_epi32(sad_r0, 0);
1656    val2 = _mm_extract_epi32(sad_r0, 2);
1657    pi4_sad[0] = (val1 + val2);
1658
1659    val1 = _mm_extract_epi32(sad_r1, 0);
1660    val2 = _mm_extract_epi32(sad_r1, 2);
1661    pi4_sad[1] = (val1 + val2);
1662
1663    val1 = _mm_extract_epi32(sad_r2, 0);
1664    val2 = _mm_extract_epi32(sad_r2, 2);
1665    pi4_sad[2] = (val1 + val2);
1666
1667    val1 = _mm_extract_epi32(sad_r3, 0);
1668    val2 = _mm_extract_epi32(sad_r3, 2);
1669    pi4_sad[3] = (val1 + val2);
1670
1671    val1 = _mm_extract_epi32(sad_r4, 0);
1672    val2 = _mm_extract_epi32(sad_r4, 2);
1673    pi4_sad[4] = (val1 + val2);
1674
1675    val1 = _mm_extract_epi32(sad_r5, 0);
1676    val2 = _mm_extract_epi32(sad_r5, 2);
1677    pi4_sad[5] = (val1 + val2);
1678
1679    val1 = _mm_extract_epi32(sad_r6, 0);
1680    val2 = _mm_extract_epi32(sad_r6, 2);
1681    pi4_sad[6] = (val1 + val2);
1682
1683    val1 = _mm_extract_epi32(sad_r7, 0);
1684    val2 = _mm_extract_epi32(sad_r7, 2);
1685    pi4_sad[7] = (val1 + val2);
1686
1687    return;
1688}
1689/*
1690*
* @brief This function computes SAD between two 16x16 blocks
*        It also computes if the block will be zero after H264 transform and quant
*        for Inter 16x16 blocks (see function name: lumainter)
1694*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (predicted) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
1706*
* @param[in] pu2_thrsh
*  Threshold for each element of the transformed quantized block
1709*
1710* @param[out] pi4_mb_distortion
1711*  integer evaluated sad
1712*
* @param[out] pu4_is_zero
*  Pointer to store if the block is zero after transform and quantization
1715*
1716* @remarks
1717*
1718******************************************************************************
1719*/
void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
                                         UWORD8 *pu1_est,
                                         WORD32 src_strd,
                                         WORD32 est_strd,
                                         UWORD16 *pu2_thrsh,
                                         WORD32 *pi4_mb_distortion,
                                         UWORD32 *pu4_is_zero)
{
    /* The 16x16 macroblock is walked as eight 4x8 strips (two 4x4 sub-blocks
     * side by side).  Each of the 4 loop iterations processes the left 4x8
     * strip of four rows, then the right 4x8 strip of the same rows, before
     * stepping four rows down (4 iterations x 4 rows = 16 rows).
     *
     * For every strip, the SAD of its two 4x4 sub-blocks is accumulated into
     * *pi4_mb_distortion.  Until 'flag' is set, a per-strip threshold test is
     * also run to estimate whether the residual would survive the H.264
     * transform and quantization (the SATQD early-exit test). */
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i est_r0, est_r1, est_r2, est_r3;
    __m128i temp0, temp1, temp2, temp3, temp4;
    __m128i zero = _mm_setzero_si128();          // all bits reset to zero
    __m128i all_one = _mm_set1_epi8(0xFF);       // all bits set; mask operand for ptest below
    __m128i sad_b1, sad_b2, threshold;
    WORD16 sad_1, sad_2;                         // SADs of the two 4x4 sub-blocks of a strip
    WORD32 i;
    UWORD32 flag = 0;                            // becomes 1 once the zero-block test fails
    WORD32 test1, test2;
    threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);  // 8 per-position 16-bit thresholds
    (*pi4_mb_distortion) = 0;

    for (i=0; i<4; i++)
    {
        /* ---------- Left 4x8 strip of the current four rows ---------- */
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2

        /* widen the 8 packed bytes of each row to 16-bit lanes */
        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        /* absolute residual per pixel: |src - est| */
        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        /* fold rows pairwise (row0+row3, row1+row2); the si/ai labels in the
         * trailing comments are per-column partial sums of the left (s) and
         * right (a) 4x4 sub-blocks of this strip */
        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);      //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;      // accumulate the MB SAD

        if (flag == 0) {
            /* Zero-block test: build eight "ls" metrics per 4x4 sub-block
             * (2*sad minus combinations of the column sums, derived below)
             * and compare them against the per-position thresholds. */
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            /* test1/test2 == 1 iff every ls value is strictly below its
             * threshold, i.e. the sub-block is predicted to quantize to zero */
            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            /* once any sub-block fails the per-position test, or its SAD
             * reaches the overall threshold pu2_thrsh[8], stop testing */
            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        /* advance to the right 4x8 strip of the same four rows */
        pu1_src += 8;
        pu1_est += 8;

        /* ---------- Right 4x8 strip (identical processing) ---------- */
        src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
        src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
        src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
        src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2

        src_r0 = _mm_cvtepu8_epi16(src_r0);
        src_r1 = _mm_cvtepu8_epi16(src_r1);
        src_r2 = _mm_cvtepu8_epi16(src_r2);
        src_r3 = _mm_cvtepu8_epi16(src_r3);

        est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
        est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
        est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
        est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));

        est_r0 = _mm_cvtepu8_epi16(est_r0);
        est_r1 = _mm_cvtepu8_epi16(est_r1);
        est_r2 = _mm_cvtepu8_epi16(est_r2);
        est_r3 = _mm_cvtepu8_epi16(est_r3);

        src_r0 = _mm_sub_epi16(src_r0, est_r0);
        src_r1 = _mm_sub_epi16(src_r1, est_r1);
        src_r2 = _mm_sub_epi16(src_r2, est_r2);
        src_r3 = _mm_sub_epi16(src_r3, est_r3);

        src_r0 = _mm_abs_epi16(src_r0);
        src_r1 = _mm_abs_epi16(src_r1);
        src_r2 = _mm_abs_epi16(src_r2);
        src_r3 = _mm_abs_epi16(src_r3);

        src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
        src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2

        //SAD calculation
        temp0 = _mm_add_epi16(src_r0, src_r1);
        temp0 = _mm_hadd_epi16(temp0, zero);
        temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values

        sad_1 = _mm_extract_epi16(temp0, 0);
        sad_2 = _mm_extract_epi16(temp0, 1);

        (*pi4_mb_distortion) += sad_1 + sad_2;

        if (flag == 0) {
            /* same zero-block test as for the left strip */
            sad_b1 = _mm_set1_epi16((sad_1 << 1));
            sad_b2 = _mm_set1_epi16((sad_2 << 1));

            src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
            src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4

            src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
            src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3

            src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
            src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0

            temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
            temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0

            temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
            temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0

            temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
            temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0

            temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0

            temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
            temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0

            temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0

            temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
            temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)

            temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
            temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)

            sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
            sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1

            temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff

            temp1 = _mm_cmpgt_epi16(threshold, sad_b2);

            temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
            temp1 = _mm_xor_si128(temp1, all_one);

            test1 = _mm_test_all_zeros(temp0, all_one);
            test2 = _mm_test_all_zeros(temp1, all_one);

            if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
                    || pu2_thrsh[8] <= sad_2)
                flag = 1;
        }

        /* step down four rows and return to column 0 */
        pu1_src += 4*src_strd - 8;
        pu1_est += 4*est_strd - 8;
    }

    /* NOTE(review): flag==1 here means the zero-block guarantee failed for
     * some 4x4 sub-block; despite the output's name, the caller appears to
     * interpret a nonzero *pu4_is_zero as "not all-zero after transform and
     * quant" — confirm against the caller. */
        *pu4_is_zero = flag;
}
1944