1/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20
21/**
22******************************************************************************
23* @file ih264e_distortion_metrics.c
24*
25* @brief
26*  This file contains definitions of routines that compute distortion
27*  between two macro/sub blocks of identical dimensions
28*
29* @author
30*  Ittiam
31*
32* @par List of Functions:
33*  - ime_sub_pel_compute_sad_16x16()
34*  - ime_calculate_sad4_prog()
35*  - ime_calculate_sad3_prog()
36*  - ime_calculate_sad2_prog()
37*  - ime_compute_sad_16x16()
38*  - ime_compute_sad_16x16_fast()
39*  - ime_compute_sad_16x16_ea8()
40*  - ime_compute_sad_8x8()
41*  - ime_compute_sad_4x4()
42*  - ime_compute_sad_16x8()
43*  - ime_compute_satqd_16x16_lumainter()
44*  - ime_compute_satqd_8x16_chroma()
45*  - ime_compute_satqd_16x16_lumaintra()
46*
47*
48* @remarks
49*  None
50*
51*******************************************************************************
52*/
53
54/*****************************************************************************/
55/* File Includes                                                             */
56/*****************************************************************************/
57
58/* System include files */
59#include <stdio.h>
60#include <stdlib.h>
61#include <string.h>
62
63/* User include files */
64#include "ime_typedefs.h"
65#include "ime_defs.h"
66#include "ime_macros.h"
67#include "ime_statistics.h"
68#include "ime_platform_macros.h"
69#include "ime_distortion_metrics.h"
70
71
72/*****************************************************************************/
73/* Function Definitions                                                      */
74/*****************************************************************************/
75
76/**
77******************************************************************************
78*
79* @brief computes distortion (SAD) at all subpel points about the src location
80*
81* @par Description
82*   This functions computes SAD at all points at a subpel distance from the
83*   current source location.
84*
85* @param[in] pu1_src
86*  UWORD8 pointer to the source
87*
88* @param[out] pu1_ref_half_x
89*  UWORD8 pointer to half pel buffer
90*
91* @param[out] pu1_ref_half_y
92*  UWORD8 pointer to half pel buffer
93*
94* @param[out] pu1_ref_half_xy
95*  UWORD8 pointer to half pel buffer
96*
97* @param[in] src_strd
98*  integer source stride
99*
100* @param[in] ref_strd
101*  integer ref stride
102*
103* @param[out] pi4_sad
104*  integer evaluated sad
105*  pi4_sad[0] - half x
106*  pi4_sad[1] - half x - 1
107*  pi4_sad[2] - half y
108*  pi4_sad[3] - half y - 1
109*  pi4_sad[4] - half xy
110*  pi4_sad[5] - half xy - 1
111*  pi4_sad[6] - half xy - strd
112*  pi4_sad[7] - half xy - 1 - strd
113*
114* @remarks
115*
116******************************************************************************
117*/
118void ime_sub_pel_compute_sad_16x16(UWORD8 *pu1_src,
119                                   UWORD8 *pu1_ref_half_x,
120                                   UWORD8 *pu1_ref_half_y,
121                                   UWORD8 *pu1_ref_half_xy,
122                                   WORD32 src_strd,
123                                   WORD32 ref_strd,
124                                   WORD32 *pi4_sad)
125{
126    UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
127    UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
128    UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
129    UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
130    UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
131
132    WORD32 row, col;
133
134    memset(pi4_sad, 0, 8 * sizeof(WORD32));
135
136    for(row = 0; row < MB_SIZE; row++)
137    {
138        for(col = 0; col < MB_SIZE; col++)
139        {
140            WORD32 src;
141            WORD32 diff;
142
143            src = pu1_src[col];
144
145            diff = src - pu1_ref_half_x[col];
146            pi4_sad[0] += ABS(diff);
147
148            diff = src - pu1_ref_half_x_left[col];
149            pi4_sad[1] += ABS(diff);
150
151            diff = src - pu1_ref_half_y[col];
152            pi4_sad[2] += ABS(diff);
153
154            diff = src - pu1_ref_half_y_top[col];
155            pi4_sad[3] += ABS(diff);
156
157            diff = src - pu1_ref_half_xy[col];
158            pi4_sad[4] += ABS(diff);
159
160            diff = src - pu1_ref_half_xy_left[col];
161            pi4_sad[5] += ABS(diff);
162
163            diff = src - pu1_ref_half_xy_top[col];
164            pi4_sad[6] += ABS(diff);
165
166            diff = src - pu1_ref_half_xy_top_left[col];
167            pi4_sad[7] += ABS(diff);
168        }
169
170        pu1_src += src_strd;
171
172        pu1_ref_half_x += ref_strd;
173        pu1_ref_half_x_left += ref_strd;
174
175        pu1_ref_half_y += ref_strd;
176        pu1_ref_half_y_top += ref_strd;
177
178        pu1_ref_half_xy += ref_strd;
179        pu1_ref_half_xy_left += ref_strd;
180        pu1_ref_half_xy_top += ref_strd;
181        pu1_ref_half_xy_top_left += ref_strd;
182    }
183}
184
185/**
186*******************************************************************************
187*
188* @brief compute sad
189*
190* @par Description: This function computes the sad at vertices of diamond grid
191* centered at reference pointer and at unit distance from it.
192*
193* @param[in] pu1_ref
194*  UWORD8 pointer to the reference
195*
196* @param[out] pu1_src
197*  UWORD8 pointer to the source
198*
199* @param[in] ref_strd
200*  integer reference stride
201*
202* @param[in] src_strd
203*  integer source stride
204*
205* @param[out] pi4_sad
206*  pointer to integer array evaluated sad
207*
208* @returns  sad at all evaluated vertexes
209*
210* @remarks  none
211*
212*******************************************************************************
213*/
214void ime_calculate_sad4_prog(UWORD8 *pu1_ref,
215                             UWORD8 *pu1_src,
216                             WORD32 ref_strd,
217                             WORD32 src_strd,
218                             WORD32 *pi4_sad)
219{
220
221    /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
222    UWORD8 *left_ptr    = pu1_ref - 1;
223    UWORD8 *right_ptr   = pu1_ref + 1;
224    UWORD8 *top_ptr     = pu1_ref - ref_strd;
225    UWORD8 *bot_ptr     = pu1_ref + ref_strd;
226
227    /* temp var */
228    WORD32 count2, count3;
229    UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE;
230    UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE;
231
232    memset(pi4_sad, 0, 4 * sizeof(WORD32));
233
234    for(count2 = MB_SIZE; count2 > 0; count2--)
235    {
236        for(count3 = MB_SIZE; count3 > 0 ; count3--)
237        {
238            WORD32 src;
239            WORD32 diff;
240
241            src = *pu1_src++;
242
243            diff = src - *left_ptr++;
244            pi4_sad[0] += ABS(diff);
245
246            diff = src - *right_ptr++;
247            pi4_sad[1] += ABS(diff);
248
249            diff = src - *top_ptr++;
250            pi4_sad[2] += ABS(diff);
251
252            diff = src - *bot_ptr++;
253            pi4_sad[3]  += ABS(diff);
254        }
255
256        bot_ptr    += u4_ref_buf_offset;
257        left_ptr   += u4_ref_buf_offset;
258        right_ptr  += u4_ref_buf_offset;
259        top_ptr    += u4_ref_buf_offset;
260
261        pu1_src += u4_cur_buf_offset;
262    }
263
264}
265
266/**
267*******************************************************************************
268*
269* @brief compute sad
270*
271* @par Description: This function computes the sad at vertices of diamond grid
272* centered at reference pointer and at unit distance from it.
273*
274* @param[in] pu1_ref1, pu1_ref2, pu1_ref3
275*  UWORD8 pointer to the reference
276*
277* @param[out] pu1_src
278*  UWORD8 pointer to the source
279*
280* @param[in] ref_strd
281*  integer reference stride
282*
283* @param[in] src_strd
284*  integer source stride
285*
286* @param[out] pi4_sad
287*  pointer to integer array evaluated sad
288*
289* @returns  sad at all evaluated vertexes
290*
291* @remarks  none
292*
293*******************************************************************************
294*/
295void ime_calculate_sad3_prog(UWORD8 *pu1_ref1,
296                             UWORD8 *pu1_ref2,
297                             UWORD8 *pu1_ref3,
298                             UWORD8 *pu1_src,
299                             WORD32 ref_strd,
300                             WORD32 src_strd,
301                             WORD32 *pi4_sad)
302{
303    /* temp var */
304    WORD32 i;
305    UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE;
306    UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE;
307
308    for(i = 16; i > 0; i--)
309    {
310        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
311        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
312        USADA8(pu1_src, pu1_ref3, pi4_sad[2]);
313        pu1_src += 4;
314        pu1_ref1 += 4;
315        pu1_ref2 += 4;
316        pu1_ref3 += 4;
317
318        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
319        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
320        USADA8(pu1_src, pu1_ref3, pi4_sad[2]);
321        pu1_src += 4;
322        pu1_ref1 += 4;
323        pu1_ref2 += 4;
324        pu1_ref3 += 4;
325
326        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
327        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
328        USADA8(pu1_src, pu1_ref3, pi4_sad[2]);
329        pu1_src += 4;
330        pu1_ref1 += 4;
331        pu1_ref2 += 4;
332        pu1_ref3 += 4;
333
334        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
335        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
336        USADA8(pu1_src, pu1_ref3, pi4_sad[2]);
337        pu1_src += 4;
338        pu1_ref1 += 4;
339        pu1_ref2 += 4;
340        pu1_ref3 += 4;
341
342        pu1_src += u4_cur_buf_offset;
343        pu1_ref1 += u4_ref_buf_offset;
344        pu1_ref2 += u4_ref_buf_offset;
345        pu1_ref3 += u4_ref_buf_offset;
346    }
347
348}
349
350/**
351*******************************************************************************
352*
353* @brief compute sad
354*
355* @par Description: This function computes the sad at vertices of diamond grid
356* centered at reference pointer and at unit distance from it.
357*
358* @param[in] pu1_ref1, pu1_ref2
359*  UWORD8 pointer to the reference
360*
361* @param[out] pu1_src
362*  UWORD8 pointer to the source
363*
364* @param[in] ref_strd
365*  integer reference stride
366*
367* @param[in] src_strd
368*  integer source stride
369*
370* @param[out] pi4_sad
371*  pointer to integer array evaluated sad
372*
373* @returns  sad at all evaluated vertexes
374*
375* @remarks  none
376*
377*******************************************************************************
378*/
379void ime_calculate_sad2_prog(UWORD8 *pu1_ref1,
380                             UWORD8 *pu1_ref2,
381                             UWORD8 *pu1_src,
382                             WORD32 ref_strd,
383                             WORD32 src_strd,
384                             WORD32 *pi4_sad)
385{
386    /* temp var */
387    WORD32 i;
388    UWORD32 u4_ref_buf_offset = ref_strd - MB_SIZE;
389    UWORD32 u4_cur_buf_offset = src_strd - MB_SIZE;
390
391    for(i = 16; i > 0; i--)
392    {
393        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
394        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
395        pu1_src += 4;
396        pu1_ref1 += 4;
397        pu1_ref2 += 4;
398
399        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
400        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
401        pu1_src += 4;
402        pu1_ref1 += 4;
403        pu1_ref2 += 4;
404
405        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
406        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
407        pu1_src += 4;
408        pu1_ref1 += 4;
409        pu1_ref2 += 4;
410
411        USADA8(pu1_src, pu1_ref1, pi4_sad[0]);
412        USADA8(pu1_src, pu1_ref2, pi4_sad[1]);
413        pu1_src += 4;
414        pu1_ref1 += 4;
415        pu1_ref2 += 4;
416
417        pu1_src += u4_cur_buf_offset;
418        pu1_ref1 += u4_ref_buf_offset;
419        pu1_ref2 += u4_ref_buf_offset;
420    }
421
422}
423
424/**
425******************************************************************************
426*
427* @brief computes distortion (SAD) between 2 16x16 blocks
428*
429* @par   Description
430*   This functions computes SAD between 2 16x16 blocks. There is a provision
431*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
432*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
433*
434* @param[in] pu1_src
435*  UWORD8 pointer to the source
436*
437* @param[out] pu1_dst
438*  UWORD8 pointer to the destination
439*
440* @param[in] src_strd
441*  integer source stride
442*
443* @param[in] dst_strd
444*  integer destination stride
445*
446* @param[in] i4_max_sad
447*  integer maximum allowed distortion
448*
449* @param[out] pi4_mb_distortion
450*  integer evaluated sad
451*
452* @remarks
453*
454******************************************************************************
455*/
456void ime_compute_sad_16x16(UWORD8 *pu1_src,
457                           UWORD8 *pu1_est,
458                           WORD32 src_strd,
459                           WORD32 est_strd,
460                           WORD32 i4_max_sad,
461                           WORD32 *pi4_mb_distortion)
462{
463    WORD32 i4_sad = 0;
464    UWORD32 u4_src_offset = src_strd - 16;
465    UWORD32 u4_est_offset = est_strd - 16;
466    UWORD32 i;
467
468GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, 16);
469
470    for(i = 16; i > 0; i--)
471    {
472        USADA8(pu1_src, pu1_est, i4_sad);
473        pu1_src += 4;
474        pu1_est += 4;
475
476        USADA8(pu1_src, pu1_est, i4_sad);
477        pu1_src += 4;
478        pu1_est += 4;
479
480        USADA8(pu1_src, pu1_est, i4_sad);
481        pu1_src += 4;
482        pu1_est += 4;
483
484        USADA8(pu1_src, pu1_est, i4_sad);
485        pu1_src += 4;
486        pu1_est += 4;
487
488        /* early exit */
489        if(i4_max_sad < i4_sad)
490        {
491
492GATHER_16x16_SAD_EE_STATS(gu4_16x16_sad_ee_stats, 16-i);
493
494            *pi4_mb_distortion = i4_sad;
495            return ;
496        }
497        pu1_src += u4_src_offset;
498        pu1_est += u4_est_offset;
499    }
500
501    *pi4_mb_distortion = i4_sad;
502    return ;
503}
504
505/**
506******************************************************************************
507*
508* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
509*
510* @par   Description
511*   This functions computes SAD between 2 16x16 blocks. There is a provision
512*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
513*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
514*
515* @param[in] pu1_src
516*  UWORD8 pointer to the source
517*
518* @param[out] pu1_dst
519*  UWORD8 pointer to the destination
520*
521* @param[in] src_strd
522*  integer source stride
523*
524* @param[in] dst_strd
525*  integer destination stride
526*
527* @param[in] i4_max_sad
528*  integer maximum allowed distortion
529*
530* @param[out] pi4_mb_distortion
531*  integer evaluated sad
532*
533* @remarks
534*
535******************************************************************************
536*/
537void ime_compute_sad_16x16_fast(UWORD8 *pu1_src,
538                                UWORD8 *pu1_est,
539                                WORD32 src_strd,
540                                WORD32 est_strd,
541                                WORD32 i4_max_sad,
542                                WORD32 *pi4_mb_distortion)
543{
544
545    WORD32 i4_sad = 0;
546    UWORD32 u4_src_offset = 2 * src_strd - 16;
547    UWORD32 u4_est_offset = 2 * est_strd - 16;
548    UWORD32 i;
549
550    UNUSED(i4_max_sad);
551
552    for(i = 16; i > 0; i-= 2)
553    {
554        USADA8(pu1_src, pu1_est, i4_sad);
555        pu1_src += 4;
556        pu1_est += 4;
557
558        USADA8(pu1_src, pu1_est, i4_sad);
559        pu1_src += 4;
560        pu1_est += 4;
561
562        USADA8(pu1_src, pu1_est, i4_sad);
563        pu1_src += 4;
564        pu1_est += 4;
565
566        USADA8(pu1_src, pu1_est, i4_sad);
567        pu1_src += 4;
568        pu1_est += 4;
569
570        pu1_src += u4_src_offset;
571        pu1_est += u4_est_offset;
572    }
573
574    *pi4_mb_distortion = (i4_sad << 1);
575    return ;
576}
577
578/**
579******************************************************************************
580*
581*  @brief computes distortion (SAD) between 2 8x8 blocks
582*
583*  @par   Description
584*   This functions computes SAD between 2 8x8 blocks. There is a provision
585*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
586*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
587*
588* @param[in] pu1_src
589*  UWORD8 pointer to the source
590*
591* @param[out] pu1_dst
592*  UWORD8 pointer to the destination
593*
594* @param[in] src_strd
595*  integer source stride
596*
597* @param[in] dst_strd
598*  integer destination stride
599*
600* @param[in] u4_max_sad
601*  integer maximum allowed distortion
602*
603* @param[out] i4_sad
604*  integer evaluated sad
605*
606* @remarks
607*
608******************************************************************************
609 */
610
611void ime_compute_sad_8x8(UWORD8 *pu1_src,
612                         UWORD8 *pu1_est,
613                         WORD32 src_strd,
614                         WORD32 est_strd,
615                         WORD32 i4_max_sad,
616                         WORD32 *pi4_mb_distortion)
617{
618    WORD32 i4_sad = 0;
619    UWORD32 u4_src_offset = src_strd - 8;
620    UWORD32 u4_est_offset = est_strd - 8;
621    UWORD32 i, j;
622    WORD16 temp;
623
624    for(i = 8; i > 0; i--)
625    {
626        for(j = 8; j > 0; j--)
627        {
628            /* SAD */
629            temp = *pu1_src++ - *pu1_est++;
630            i4_sad += ABS(temp);
631        }
632        /* early exit */
633        if(i4_max_sad < i4_sad)
634        {
635            *pi4_mb_distortion = i4_sad;
636            return;
637        }
638        pu1_src += u4_src_offset;
639        pu1_est += u4_est_offset;
640    }
641    *pi4_mb_distortion = i4_sad;
642}
643
644/**
645******************************************************************************
646*
647*  @brief computes distortion (SAD) between 2 4x4 blocks
648*
649*  @par   Description
650*   This functions computes SAD between 2 4x4 blocks. There is a provision
651*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
652*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
653*
654* @param[in] pu1_src
655*  UWORD8 pointer to the source
656*
657* @param[out] pu1_dst
658*  UWORD8 pointer to the destination
659*
660* @param[in] src_strd
661*  integer source stride
662*
663* @param[in] dst_strd
664*  integer destination stride
665*
666* @param[in] u4_max_sad
667*  integer maximum allowed distortion
668*
669* @param[out] pi4_mb_distortion
670*  integer evaluated sad
671*
672* @remarks
673*
674******************************************************************************
675*/
676void ime_compute_sad_4x4
677        (
678            UWORD8 *pu1_src,
679            UWORD8 *pu1_est,
680            WORD32 src_strd,
681            WORD32 est_strd,
682            WORD32 i4_max_sad,
683            WORD32 *pi4_mb_distortion
684        )
685{
686    WORD32 i4_sad = 0;
687
688    UNUSED(i4_max_sad);
689
690    USADA8(pu1_src, pu1_est, i4_sad);
691    pu1_src += src_strd;
692    pu1_est += est_strd;
693
694    USADA8(pu1_src, pu1_est, i4_sad);
695    pu1_src += src_strd;
696    pu1_est += est_strd;
697
698    USADA8(pu1_src, pu1_est, i4_sad);
699    pu1_src += src_strd;
700    pu1_est += est_strd;
701
702    USADA8(pu1_src, pu1_est, i4_sad);
703    *pi4_mb_distortion = i4_sad;
704}
705
706
707/**
708******************************************************************************
709*
710*  @brief computes distortion (SAD) between 2 16x8  blocks
711*
712*
713*  @par   Description
714*   This functions computes SAD between 2 16x8 blocks. There is a provision
715*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
716*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
717*
718* @param[in] pu1_src
719*  UWORD8 pointer to the source
720*
721* @param[out] pu1_dst
722*  UWORD8 pointer to the destination
723*
724* @param[in] src_strd
725*  integer source stride
726*
727* @param[in] dst_strd
728*  integer destination stride
729*
730* @param[in] u4_max_sad
731*  integer maximum allowed distortion
732*
733* @param[out] pi4_mb_distortion
734*  integer evaluated sad
735*
736* @remarks
737*
738******************************************************************************
739*/
740void ime_compute_sad_16x8
741        (
742            UWORD8 *pu1_src,
743            UWORD8 *pu1_est,
744            WORD32 src_strd,
745            WORD32 est_strd,
746            WORD32 i4_max_sad,
747            WORD32 *pi4_mb_distortion
748        )
749{
750    WORD32 i4_sad = 0;
751    UWORD32 u4_src_offset = src_strd - 16;
752    UWORD32 u4_est_offset = est_strd - 16;
753    UWORD32 i, j;
754    WORD16 temp;
755
756GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, 8);
757
758    for(i = 8; i > 0; i--)
759    {
760        for(j = 16; j > 0; j--)
761        {
762            /* SAD */
763            temp = *pu1_src++ - *pu1_est++;
764            i4_sad += ABS(temp);
765        }
766        /* early exit */
767        if(i4_max_sad < i4_sad)
768        {
769
770GATHER_16x8_SAD_EE_STATS(gu4_16x8_sad_ee_stats, 8-i);
771
772            *pi4_mb_distortion = i4_sad;
773
774            return;
775        }
776        pu1_src += u4_src_offset;
777        pu1_est += u4_est_offset;
778    }
779
780    *pi4_mb_distortion = i4_sad;
781    return;
782
783}
784
785/**
786******************************************************************************
787*
788* @brief computes distortion (SAD) between 2 16x16 blocks
789*
790* @par   Description
791*   This functions computes SAD between 2 16x16 blocks. There is a provision
792*   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
793*   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
794*
795* @param[in] pu1_src
796*  UWORD8 pointer to the source
797*
798* @param[out] pu1_dst
799*  UWORD8 pointer to the destination
800*
801* @param[in] src_strd
802*  integer source stride
803*
804* @param[in] dst_strd
805*  integer destination stride
806*
807* @param[in] i4_max_sad
808*  integer maximum allowed distortion
809*
810* @param[out] pi4_mb_distortion
811*  integer evaluated sad
812*
813* @remarks
814*
815******************************************************************************
816*/
817void ime_compute_sad_16x16_ea8(UWORD8 *pu1_src,
818                               UWORD8 *pu1_est,
819                               WORD32 src_strd,
820                               WORD32 est_strd,
821                               WORD32 i4_max_sad,
822                               WORD32 *pi4_mb_distortion)
823{
824    WORD32 i4_sad = 0;
825    UWORD32 u4_src_offset = src_strd - 16;
826    UWORD32 u4_est_offset = est_strd - 16;
827    UWORD32 i, j;
828    WORD16 temp;
829    UWORD8 *pu1_src_temp = pu1_src + src_strd;
830    UWORD8 *pu1_est_temp = pu1_est + est_strd;
831
832    for(i = 16; i > 0; i -= 2)
833    {
834        for(j = 16; j > 0; j--)
835        {
836            /* SAD */
837            temp = *pu1_src++ - *pu1_est++;
838            i4_sad += ABS(temp);
839        }
840
841        pu1_src += (u4_src_offset + src_strd);
842        pu1_est += (u4_est_offset + est_strd);
843
844    }
845
846    /* early exit */
847    if(i4_max_sad < i4_sad)
848    {
849        *pi4_mb_distortion = i4_sad;
850        return;
851    }
852
853    pu1_src = pu1_src_temp;
854    pu1_est = pu1_est_temp;
855
856    for(i = 16; i > 0; i -= 2)
857    {
858        for(j = 16; j > 0; j--)
859        {
860            /* SAD */
861            temp = *pu1_src++ - *pu1_est++;
862            i4_sad += ABS(temp);
863        }
864
865        pu1_src += u4_src_offset + src_strd;
866        pu1_est += u4_est_offset + est_strd;
867    }
868
869    *pi4_mb_distortion = i4_sad;
870    return;
871}
872
873
874/**
875*******************************************************************************
876*
877* @brief This function computes SAD between two 16x16 blocks
878*        It also computes if the block will be zero after H264 transform and quant for
879*        Intra 16x16 blocks
880*
881* @param[in] pu1_src
882*  UWORD8 pointer to the source
883*
884* @param[out] pu1_dst
885*  UWORD8 pointer to the destination
886*
887* @param[in] src_strd
888*  integer source stride
889*
890* @param[in] dst_strd
891*  integer destination stride
892*
893* @param[in] pu2_thrsh
894*  Threshold for each element of transofrmed quantized block
895*
896* @param[out] pi4_mb_distortion
897*  integer evaluated sad
898*
899* @param[out] pu4_is_zero
900*  Poitner to store if the block is zero after transform and quantization
901*
902* @remarks
903*
904******************************************************************************
905*/
906void ime_compute_satqd_16x16_lumainter(UWORD8 *pu1_src,
907                                         UWORD8 *pu1_est,
908                                         WORD32 src_strd,
909                                         WORD32 est_strd,
910                                         UWORD16 *pu2_thrsh,
911                                         WORD32 *pi4_mb_distortion,
912                                         UWORD32 *pu4_is_non_zero)
913{
914    UWORD32 i,j;
915    WORD16 s1,s2,s3,s4,sad_1,sad_2,ls1,ls2,ls3,ls4,ls5,ls6,ls7,ls8;
916    UWORD8 *pu1_src_lp,*pu1_est_lp;
917    UWORD32 sad = 0;
918
919    (*pi4_mb_distortion) = 0;
920    for(i=0;i<4;i++)
921    {
922        for(j=0;j<4;j++)
923        {
924            pu1_src_lp = pu1_src + 4*j;
925            pu1_est_lp = pu1_est + 4*j;
926
927            s1 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
928            s4 = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
929
930            pu1_src_lp += src_strd;
931            pu1_est_lp += est_strd;
932
933            s2 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
934            s3 = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
935
936            pu1_src_lp += src_strd;
937            pu1_est_lp += est_strd;
938
939            s2 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
940            s3 += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
941
942            pu1_src_lp += src_strd;
943            pu1_est_lp += est_strd;
944
945            s1 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
946            s4 += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
947
948            sad_1 = s1+s2+s3+s4;
949
950            if(sad == 0)
951            {
952                sad_2 = sad_1<<1;
953
954                ls1 = sad_2 -(s2 + s3);
955                ls2 = sad_2 -(s1 + s4);
956                ls3 = sad_2 -(s3 + s4);
957                ls4 = sad_2 -(s3 - (s1<<1));
958                ls5 = sad_2 -(s4 - (s2<<1));
959                ls6 = sad_2 -(s1 + s2);
960                ls7 = sad_2 -(s2 - (s4<<1));
961                ls8 = sad_2 -(s1 - (s3<<1));
962
963                if(
964                        pu2_thrsh[8] <= sad_1   ||
965                        pu2_thrsh[0] <=  ls2    ||
966                        pu2_thrsh[1] <=  ls1    ||
967                        pu2_thrsh[2] <=  ls8    ||
968                        pu2_thrsh[3] <=  ls5    ||
969
970                        pu2_thrsh[4] <=  ls6    ||
971                        pu2_thrsh[5] <=  ls3    ||
972                        pu2_thrsh[6] <=  ls7    ||
973                        pu2_thrsh[7] <=  ls4
974
975                )sad = 1;
976            }
977            (*pi4_mb_distortion) += sad_1;
978        }
979        pu1_src +=  (src_strd *4);
980        pu1_est +=  (est_strd *4);
981    }
982    *pu4_is_non_zero = sad;
983}
984
985
986/**
987******************************************************************************
988*
989* @brief computes distortion (SAD and SAQTD) between 2 16x8 (interleaved) chroma blocks
990*
991*
992* @par   Description
*   This function computes the SAD between two 16x8 interleaved chroma blocks.
*   It also checks the SATQD (sum of absolute transformed and quantized
*   differences) between the blocks.
*   If the SATQD is zero, it gives back zero.
*   Otherwise the SAD is returned.
*   There is no provision for early exit.
998*
999*   The transform done here is the transform for chroma blocks in H264
1000*
1001* @param[in] pu1_src
1002*  UWORD8 pointer to the source
1003*
1004* @param[out] pu1_dst
1005*  UWORD8 pointer to the destination
1006*
1007* @param[in] src_strd
1008*  integer source stride
1009*
1010* @param[in] dst_strd
1011*  integer destination stride
1012*
1013* @param[in] pu2_thrsh
1014*  Threshold for each element of transofrmed quantized block
1015*
1016* @param[out] pi4_mb_distortion
1017*  integer evaluated sad
1018*
1019* @remarks
* Function code is not updated.
* Will require debugging and minor modifications.
1022*
1023******************************************************************************
1024*/
1025void ime_compute_satqd_8x16_chroma(UWORD8 *pu1_src,
1026                                     UWORD8 *pu1_est,
1027                                     WORD32 src_strd,
1028                                     WORD32 est_strd,
1029                                     WORD32 max_sad,
1030                                     UWORD16 *thrsh)
1031{
1032    WORD32 i,j,plane;
1033    WORD16 s1,s2,s3,s4,sad_1,sad_2,ls1,ls2,ls3,ls4,ls5,ls6,ls7,ls8;
1034    UWORD8 *pu1_src_lp,*pu1_est_lp,*pu1_src_plane,*pu1_est_plane;
1035    WORD32 sad =0;
1036    UNUSED(max_sad);
1037
1038    pu1_src_plane = pu1_src;
1039    pu1_est_plane = pu1_est;
1040
1041    for(plane =0;plane<2;plane++)
1042    {
1043        for(i=0;i<4;i++)
1044        {
1045            for(j=0;j<4;j++)
1046            {
1047                pu1_src_lp = pu1_src + 8*j;
1048                pu1_est_lp = pu1_est + 8*j;
1049
1050                s1 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]);
1051                s4 = ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]);
1052
1053                pu1_src_lp += src_strd;
1054                pu1_est_lp += est_strd;
1055
1056                s2 = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]);
1057                s3 = ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]);
1058
1059                pu1_src_lp += src_strd;
1060                pu1_est_lp += est_strd;
1061
1062                s2 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]);
1063                s3 += ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]);
1064
1065                pu1_src_lp += src_strd;
1066                pu1_est_lp += est_strd;
1067
1068                s1 += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[6] - (WORD16)pu1_est_lp[6]);
1069                s4 += ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2])+ ABS((WORD16)pu1_src_lp[4] - (WORD16)pu1_est_lp[4]);
1070
1071                sad_1 = s1+s2+s3+s4;
1072                sad_2 = sad_1<<1;
1073
1074                ls1 = sad_2 -(s2 + s3);
1075                ls2 = sad_2 -(s1 + s4);
1076                ls3 = sad_2 -(s3 + s4);
1077                ls4 = sad_2 -(s3 - (s1<<1));
1078                ls5 = sad_2 -(s4 - (s2<<1));
1079                ls6 = sad_2 -(s1 + s2);
1080                ls7 = sad_2 -(s2 - (s4<<1));
1081                ls8 = sad_2 -(s1 - (s3<<1));
1082
1083                if(
1084                        //thrsh[0] >  sad_1     && Chroma Dc is checked later
1085                        thrsh[1] >  ls1     &&
1086                        thrsh[2] >  sad_1   &&
1087                        thrsh[3] >  ls2     &&
1088
1089                        thrsh[4] >  ls3     &&
1090                        thrsh[5] >  ls4     &&
1091                        thrsh[6] >  ls3     &&
1092                        thrsh[7] >  ls5     &&
1093
1094                        thrsh[8] >  sad_1   &&
1095                        thrsh[9] >  ls1     &&
1096                        thrsh[10]>  sad_1   &&
1097                        thrsh[11]>  ls2     &&
1098
1099                        thrsh[12]>  ls6     &&
1100                        thrsh[13]>  ls7     &&
1101                        thrsh[14]>  ls6     &&
1102                        thrsh[15]>  ls8
1103                )
1104                {
1105                    /*set current sad to be zero*/
1106                }
1107                else
1108                    return ;
1109
1110                sad += sad_1;
1111            }
1112            pu1_src +=  (src_strd *4);
1113            pu1_est +=  (est_strd *4);
1114        }
1115        if(sad < (thrsh[0]<<1))sad = 0;
1116        else return ;
1117
1118        pu1_src = pu1_src_plane+1;
1119        pu1_est = pu1_est_plane+1;
1120    }
1121    return ;
1122}
1123
1124
1125/**
1126******************************************************************************
1127*
1128* @brief computes distortion (SAD and SAQTD) between 2 16x16 blocks
1129*
1130* @par   Description
1131*   This functions computes SAD between 2 16x16 blocks.
1132*   It also checks if the SATDD(Sum of absolute transformed wuqntized differnce beteern the blocks
1133*   If SAQTD is zero, it gives back zero
1134*   Other wise sad is retrned
1135*   There is no provison for early exit
1136*
1137*   The transform done here is the transform for inter 16x16 blocks in H264
1138*
1139* @param[in] pu1_src
1140*  UWORD8 pointer to the source
1141*
1142* @param[out] pu1_dst
1143*  UWORD8 pointer to the destination
1144*
1145* @param[in] src_strd
1146*  integer source stride
1147*
1148* @param[in] dst_strd
1149*  integer destination stride
1150*
1151* @param[in] pu2_thrsh
1152*  Threshold for each element of transofrmed quantized block
1153*
1154* @param[out] pi4_mb_distortion
1155*  integer evaluated sad
1156*
1157* @remarks
1158*
1159******************************************************************************
1160*/
1161void ime_compute_satqd_16x16_lumaintra(UWORD8 *pu1_src,
1162                                         UWORD8 *pu1_est,
1163                                         WORD32 src_strd,
1164                                         WORD32 est_strd,
1165                                         WORD32 max_sad,
1166                                         UWORD16 *thrsh,
1167                                         WORD32 *pi4_mb_distortion,
1168                                         UWORD8 *sig_nz_sad)
1169{
1170    UWORD32 i,j;
1171    WORD16 s1[4],s2[4],s3[4],s4[4],sad[4];
1172    UWORD8 *pu1_src_lp,*pu1_est_lp;
1173    UWORD8 *sig_sad_dc;
1174    UWORD32 nz_sad_sig = 0;
1175    UNUSED(max_sad);
1176    *pi4_mb_distortion =0;
1177
1178    sig_sad_dc = sig_nz_sad;
1179    sig_nz_sad++;
1180
1181    for(i=0;i<4;i++)
1182    {
1183        for(j=0;j<4;j++)
1184        {
1185            pu1_src_lp = pu1_src + 4*j;
1186            pu1_est_lp = pu1_est + 4*j;
1187
1188            s1[j] = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
1189            s4[j] = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
1190
1191            pu1_src_lp += src_strd;
1192            pu1_est_lp += est_strd;
1193
1194            s2[j] = ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
1195            s3[j] = ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
1196
1197            pu1_src_lp += src_strd;
1198            pu1_est_lp += est_strd;
1199
1200            s2[j] += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
1201            s3[j] += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
1202
1203            pu1_src_lp += src_strd;
1204            pu1_est_lp += est_strd;
1205
1206            s1[j] += ABS((WORD16)pu1_src_lp[0] - (WORD16)pu1_est_lp[0])+ ABS((WORD16)pu1_src_lp[3] - (WORD16)pu1_est_lp[3]);
1207            s4[j] += ABS((WORD16)pu1_src_lp[1] - (WORD16)pu1_est_lp[1])+ ABS((WORD16)pu1_src_lp[2] - (WORD16)pu1_est_lp[2]);
1208
1209            sad[j] = ((s1[j]+s2[j]+s3[j]+s4[j])<<1);
1210        }
1211
1212        for(j=0;j<4;j++)
1213        {
1214
1215            if(
1216                    //thrsh[0] > (sad[j] >> 1) &&Dc goes in the other part
1217                    thrsh[1] > (sad[j] -(s2[j] + s3[j])) &&
1218                    thrsh[2] > (sad[j]>>1) &&
1219                    thrsh[3] > (sad[j] -(s1[j] + s4[j])) &&
1220
1221                    thrsh[4] > (sad[j] -(s3[j] + s4[j])) &&
1222                    thrsh[5] > (sad[j] -(s3[j] - (s1[j]<<1))) &&
1223                    thrsh[6] > (sad[j] -(s3[j] + s4[j])) &&
1224                    thrsh[7] > (sad[j] -(s4[j] - (s2[j]<<1))) &&
1225
1226                    thrsh[8] > (sad[j]>>1) &&
1227                    thrsh[9] > (sad[j] -(s2[j] + s3[j])) &&
1228                    thrsh[10]> (sad[j]>>1) &&
1229                    thrsh[11]> (sad[j] -(s1[j] + s4[j])) &&
1230
1231                    thrsh[12]> (sad[j] -(s1[j] + s2[j])) &&
1232                    thrsh[13]> (sad[j] -(s2[j] - (s4[j]<<1))) &&
1233                    thrsh[14]> (sad[j] -(s1[j] + s2[j])) &&
1234                    thrsh[15]> (sad[j] -(s1[j] - (s3[j]<<1)))
1235            )
1236            {
1237                //sad[j] = 0;   /*set current sad to be zero*/
1238                sig_nz_sad[j] = 0;/*Signal that the sad is zero*/
1239            }
1240            else
1241            {
1242                sig_nz_sad[j] = 1;/*signal that sad is non zero*/
1243                nz_sad_sig = 1;
1244            }
1245
1246            (*pi4_mb_distortion) += (sad[j]>>1);
1247            //if((*pi4_mb_distortion) >= max_sad)return; /*return or some thing*/
1248        }
1249
1250        sig_nz_sad += 4;
1251        pu1_src +=  (src_strd *4);
1252        pu1_est +=  (est_strd *4);
1253    }
1254
1255    if((*pi4_mb_distortion) < thrsh[0]<<2)
1256    {
1257        *sig_sad_dc = 0;
1258        if(nz_sad_sig == 0)(*pi4_mb_distortion) = 0;
1259    }
1260    else *sig_sad_dc = 1;
1261}
1262
1263
1264