159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/* ------------------------------------------------------------------
259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * Copyright (C) 1998-2009 PacketVideo
359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong *
459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * Licensed under the Apache License, Version 2.0 (the "License");
559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * you may not use this file except in compliance with the License.
659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * You may obtain a copy of the License at
759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong *
859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong *      http://www.apache.org/licenses/LICENSE-2.0
959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong *
1059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * Unless required by applicable law or agreed to in writing, software
1159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * distributed under the License is distributed on an "AS IS" BASIS,
1259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
1359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * express or implied.
1459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * See the License for the specific language governing permissions
1559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * and limitations under the License.
1659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong * -------------------------------------------------------------------
1759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong */
1859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/*********************************************************************************/
1959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/*  Filename: sad_inline.h                                                      */
2059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/*  Description: Implementation for in-line functions used in dct.cpp           */
2159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/*  Modified:                                                                   */
2259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong/*********************************************************************************/
2359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#ifndef _SAD_INLINE_H_
2459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define _SAD_INLINE_H_
2559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
2659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#ifdef __cplusplus
2759f566c4ec3dfc097ad8163523e522280b27e5c3James Dongextern "C"
2859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong{
2959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#endif
3059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
3159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* ARM GNU COMPILER  */
3259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
3359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
3459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
3559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        tmp = tmp - tmp2;
3659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (tmp > 0) sad += tmp;
3759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        else sad -= tmp;
3859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
3959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad;
4059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
4159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
4259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
4359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
4459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x7;
4559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
4659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x7 = src2 ^ src1;       /* check odd/even combination */
4759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if ((uint32)src2 >= (uint32)src1)
4859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
4959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            src1 = src2 - src1;     /* subs */
5059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
5159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        else
5259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
5359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            src1 = src1 - src2;
5459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
5559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
5659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x7 = mask & ((uint32)x7 >> 1);
5759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x7 = (x7 << 8) - x7;
5859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
5959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        src1 = src1 ^(x7 >> 7);   /* take absolute value of negative byte */
6059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
6159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return src1;
6259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
6359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
6459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 3
6559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 24
6659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
6759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
6859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
6959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
7059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 2
7159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
7259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 16
7359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
7459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
7559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
7659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 1
7759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
7859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 8
7959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
8059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
8159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
8259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
8359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
8459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
8559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
8659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x9 = 0x80808080; /* const. */
8759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
88377b2ec9a2885f9b6405b07ba900a9e3f4349c38Kévin PETIT        x8 = (uintptr_t)ref & 0x3;
8959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 3)
9059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset3;
9159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 2)
9259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset2;
9359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 1)
9459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset1;
9559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
9659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
9759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x5 = 0;
9859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
9959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x6 = 0xFFFF00FF;
10059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
10159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        ref -= lx;
10259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        blk -= 16;
10359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
10459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x8 = 16;
10559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
10659f566c4ec3dfc097ad8163523e522280b27e5c3James DongLOOP_SAD0:
10759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****** process 8 pixels ******/
10859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = *((uint32*)(ref += lx));
10959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = *((uint32*)(ref + 4));
11059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x12 = *((uint32*)(blk += 16));
11159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x14 = *((uint32*)(blk + 4));
11259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
11359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
11459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
11559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
11659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
11759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
11859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
11959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10; /* accumulate low bytes */
12059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
12159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
12259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
12359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
12459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
12559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
12659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****** process 8 pixels ******/
12759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = *((uint32*)(ref + 8));
12859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = *((uint32*)(ref + 12));
12959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x12 = *((uint32*)(blk + 8));
13059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x14 = *((uint32*)(blk + 12));
13159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
13259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
13359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
13459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
13559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
13659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
13759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
13859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10;  /* accumulate low bytes */
13959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
14059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
14159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
14259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
14359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
14459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
14559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****************/
14659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x5 - (x4 << 8); /* extract low bytes */
14759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + x4;     /* add with high bytes */
14859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + (x10 << 16); /* add with lower half word */
14959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
15059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
15159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
15259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            if (--x8)
15359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            {
15459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                goto LOOP_SAD0;
15559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            }
15659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
15759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
15859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
15959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return ((uint32)x10 >> 16);
16059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
16159f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset3:
16259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
16359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset3(ref, blk, lx, dmin);
16459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
16559f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset2:
16659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
16759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset2(ref, blk, lx, dmin);
16859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
16959f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset1:
17059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
17159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset1(ref, blk, lx, dmin);
17259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
17359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
17459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
17559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#elif defined(__CC_ARM)  /* only work with arm v5 */
17659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * SUB_SAD (ARM RVCT inline-assembler variant).
 *
 * Adds the magnitude of the difference between tmp and tmp2 to sad and
 * returns the new total, i.e. sad + |tmp2 - tmp|.
 */
17759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
17859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
17959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
18059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
18159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            rsbs    tmp, tmp, tmp2 ; /* tmp = tmp2 - tmp, flags updated */
18259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            rsbmi   tmp, tmp, #0 ; /* negate when the difference was negative */
18359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            add     sad, sad, tmp ; /* accumulate the magnitude */
18459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
18559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
18659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad;
18759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
18859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * sad_4pixel (ARM RVCT inline-assembler variant).
 *
 * Treats src1 and src2 as four packed bytes each and, with the mask
 * 0x80808080 that simd_sad_mb passes, returns the four per-byte absolute
 * differences packed into one word.
 */
18959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
19059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
19159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x7;
19259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
19359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
19459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
19559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR     x7, src2, src1;     /* check odd/even combination */
19659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            SUBS    src1, src2, src1; /* word-wide src2 - src1; carry is clear if it borrowed */
19759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR     x7, x7, src1; /* bits where a borrow entered */
19859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            AND     x7, mask, x7, lsr #1; /* borrow into byte k+1 -> bit 7 of byte k */
19959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            ORRCC   x7, x7, #0x80000000; /* whole word borrowed: mark the top byte too */
20059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            RSB     x7, x7, x7, lsl #8; /* x7 = x7 * 255 */
20159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
20259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
20359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
20459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
20559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return src1;
20659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
20759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * sad_4pixelN (ARM RVCT inline-assembler variant).
 *
 * Same instruction pattern as sad_4pixel, but built on a word-wide ADD of
 * src1 and src2 instead of a subtraction, with the carry out of the add
 * rotated into the flag word (RRX) and the per-byte correction subtracted
 * rather than added.
 * NOTE(review): presumably the caller passes one operand bitwise
 * complemented so that the sum stands for a negated difference - confirm
 * against the call sites in sad_mb_offset.h.
 */
20859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
20959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
21059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x7;
21159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
21259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
21359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
21459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR      x7, src2, src1;        /* check odd/even combination */
21559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            ADDS     src1, src2, src1; /* word-wide sum, carry flag updated */
21659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
21759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            ANDS     x7, mask, x7, rrx; /* shift right by one with the carry entering bit 31, then mask */
21859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            RSB      x7, x7, x7, lsl #8; /* x7 = x7 * 255 */
21959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
22059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
22159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
22259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
22359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return src1;
22459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
22559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * sum_accumulate (ARM RVCT variant): expands to an __asm block that works
 * directly on the variables x4, x5, x6, x10 and x11 of the scope where it
 * is expanded; x10 and x11 are overwritten.
 *
 * SBC is subtract-with-carry, so the result depends on the carry flag at
 * the point of expansion.  BIC computes x6 AND NOT x10 (resp. x11).
 * NOTE(review): the in-line comments ("x10 & 0xFF00FF00") presumably assume
 * x6 == 0xFF00FF00 and complemented values in x10/x11 - confirm against
 * the users of this macro (it is not expanded in this part of the file).
 */
22659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
22759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
22859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
22959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        SBC      x5, x5, x11;    /* accumulate low bytes */ \
23059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
23159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
23259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
23359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
23459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 3
23559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 24
23659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x08000001
23759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
23859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
23959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
24059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
24159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 2
24259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
24359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 16
24459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef INC_X8
24559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x10000001
24659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
24759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
24859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
24959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 1
25059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
25159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 8
25259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef INC_X8
25359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x08000001
25459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
25559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
25659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * simd_sad_mb (ARM RVCT variant, mixed C and inline assembler).
 *
 * Accumulates the sum of absolute differences between 16 bytes per row
 * read from ref (rows lx bytes apart) and 16 bytes per row read from blk
 * (rows 16 bytes apart), four bytes per sad_4pixel call.  After every row
 * the running total is compared with dmin; the loop continues only while
 * the total is <= dmin and fewer than 16 rows have been processed.
 *
 * Returns the running total (upper half-word of the folded accumulator).
 *
 * If the low two bits of ref are not zero, control branches to one of the
 * SadMBOffset labels, which return the result of sad_mb_offset1/2/3.
 * NOTE(review): those helpers presumably come from the inclusions of
 * sad_mb_offset.h above - confirm.
 */
25759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
25859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
25959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
26059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
26159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x9 = 0x80808080; /* const. */
26259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x5 = 0;
26359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
        /* Shift the low two address bits of ref into the flags and pick
         * the handler for that misalignment; x8 is left at 0 when ref is
         * word aligned and then serves as the row counter. */
26459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
26559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
26659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            MOVS    x8, ref, lsl #31 ;
26759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            BHI     SadMBOffset3;
26859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            BCS     SadMBOffset2;
26959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            BMI     SadMBOffset1;
27059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
27159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            MVN     x6, #0xFF00;
27259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
27359f566c4ec3dfc097ad8163523e522280b27e5c3James DongLOOP_SAD0:
27459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****** process 8 pixels ******/
27559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = *((int32*)(ref + 12));
27659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = *((int32*)(ref + 8));
27759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x14 = *((int32*)(blk + 12));
27859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x12 = *((int32*)(blk + 8));
27959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
28059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
28159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
28259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
28359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
28459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
28559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
28659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10;  /* accumulate low bytes */
28759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
28859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
28959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
29059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
29159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
29259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
        /* Load the first eight bytes of the row; the post-indexed loads
         * also advance ref by lx and blk by 16 to the next row. */
29359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
29459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
29559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            /****** process 8 pixels ******/
29659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            LDR     x11, [ref, #4];
29759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            LDR     x10, [ref], lx ;
29859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            LDR     x14, [blk, #4];
29959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            LDR     x12, [blk], #16 ;
30059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
30159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
30259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
30359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
30459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
30559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
30659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
30759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
30859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10;  /* accumulate low bytes */
30959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
31059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
31159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
31259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
31359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
31459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
31559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****************/
31659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x5 - (x4 << 8); /* extract low bytes */
31759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + x4;     /* add with high bytes */
31859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + (x10 << 16); /* add with lower half word */
31959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
        /* Compare the running total with dmin.  While it is <= dmin, add
         * 0x10000001 to x8: the sixteenth addition carries out of bit 31,
         * which makes the BLS condition false and ends the loop. */
32059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        __asm
32159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
32259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            /****************/
32359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            RSBS    x11, dmin, x10, lsr #16;
32459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            ADDLSS  x8, x8, #0x10000001;
32559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            BLS     LOOP_SAD0;
32659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
32759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
32859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return ((uint32)x10 >> 16);
32959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
33059f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset3:
33159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
33259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset3(ref, blk, lx, dmin, x8);
33359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
33459f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset2:
33559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
33659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset2(ref, blk, lx, dmin, x8);
33759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
33859f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset1:
33959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
34059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset1(ref, blk, lx, dmin, x8);
34159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
34259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
34359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
34459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
34559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
34659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
34759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    {
34859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 out;
34959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 temp1;
35059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 ss = sad;
35159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 tt = tmp;
35259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 uu = tmp2;
35359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
35459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        asm volatile("rsbs  %1, %4, %3\n\t"
35559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "rsbmi %1, %1, #0\n\t"
35659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "add   %0, %2, %1"
35759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong             : "=&r"(out),
35859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "=&r"(temp1)
35959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             : "r"(ss),
36059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(tt),
36159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(uu));
36259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return out;
36359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
36459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
36559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
36659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong{
36759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 out;
36859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 temp1;
36959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 s1 = src1;
37059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 s2 = src2;
37159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 mm = mask;
37259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
37359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        asm volatile("eor   %0, %3, %2\n\t"
37459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "subs  %1, %3, %2\n\t"
37559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "eor   %0, %0, %1\n\t"
37659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "and   %0, %4, %0, lsr #1\n\t"
37759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "orrcc %0, %0, #0x80000000\n\t"
37859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "rsb   %0, %0, %0, lsl #8\n\t"
37959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "add   %1, %1, %0, asr #7\n\t"
38059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "eor   %1, %1, %0, asr #7"
38159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong             : "=&r"(out),
38259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "=&r"(temp1)
38359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             : "r"(s1),
38459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(s2),
38559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(mm));
38659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
38759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return temp1;
38859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
38959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
39059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
39159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong{
39259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 out;
39359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 temp1;
39459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 s1 = src1;
39559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 s2 = src2;
39659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        register int32 mm = mask;
39759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
39859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        asm volatile("eor    %1, %3, %2\n\t"
39959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "adds   %0, %3, %2\n\t"
40059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "eor    %1, %1, %0\n\t"
40159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "ands   %1, %4, %1,rrx\n\t"
40259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "rsb    %1, %1, %1, lsl #8\n\t"
40359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "sub    %0, %0, %1, asr #7\n\t"
40459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "eor    %0, %0, %1, asr #7"
40559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong             : "=&r"(out),
40659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "=&r"(temp1)
40759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             : "r"(s1),
40859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(s2),
40959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             "r"(mm));
41059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
41159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return (out);
41259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
41359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
/*
 * sum_accumulate
 *
 * Expands to one inline-asm statement (including its trailing ';') that
 * operates on locals named x4, x5, x6, x10 and x11, which must be in scope
 * where the macro is used.  Operand map: %0 = x5, %1 = x10, %2 = x4,
 * %3 = x11, %4 = x6.  In order, the statement performs:
 *
 *   x5  = x5 - x10 - (1 - C)      sbc, C being the incoming carry flag
 *   x10 = x6 & ~x10               bic
 *   x4  = x4 + (x10 >> 8)         logical shift
 *   x5  = x5 - x11 - (1 - C)      sbc, same C (nothing in between sets flags)
 *   x11 = x6 & ~x11               bic
 *   x4  = x4 + (x11 >> 8)         logical shift
 *
 * x10 and x11 are modified in place; x6 is only read.
 *
 * NOTE(review): the first instruction reads the carry flag, which this
 * statement never sets itself, so it depends on flags left by code executed
 * before it -- presumably the "ands ..., rrx" in sad_4pixelN.  The compiler
 * is not told about that dependency; confirm at the use sites.
 */
41459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define sum_accumulate asm volatile("sbc  %0, %0, %1\n\t" \
41559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                "bic  %1, %4, %1\n\t" \
41659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                "add  %2, %2, %1, lsr #8\n\t" \
41759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                "sbc  %0, %0, %3\n\t" \
41859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                "bic  %3, %4, %3\n\t" \
41959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                "add  %2, %2, %3, lsr #8" \
42059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                :"+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
42159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                                :"r"(x6));
42259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
42359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 3
42459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 24
42559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x08000001
42659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
42759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
42859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
42959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
43059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 2
43159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
43259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 16
43359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef INC_X8
43459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x10000001
43559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
43659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
43759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef NUMBER
43859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define NUMBER 1
43959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef SHIFT
44059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define SHIFT 8
44159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#undef INC_X8
44259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#define INC_X8 0x08000001
44359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#include "sad_mb_offset.h"
44459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
44559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
44659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
44759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong{
44859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
44959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
45059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x9 = 0x80808080; /* const. */
45159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x5 = 0;
45259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
45359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x8 = (uint32)ref & 0x3;
45459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 3)
45559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset3;
45659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 2)
45759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset2;
45859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (x8 == 1)
45959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            goto SadMBOffset1;
46059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
46159f566c4ec3dfc097ad8163523e522280b27e5c3James Dongasm volatile("mvn %0, #0xFF00": "=r"(x6));
46259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
46359f566c4ec3dfc097ad8163523e522280b27e5c3James DongLOOP_SAD0:
46459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****** process 8 pixels ******/
46559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = *((int32*)(ref + 12));
46659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = *((int32*)(ref + 8));
46759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x14 = *((int32*)(blk + 12));
46859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x12 = *((int32*)(blk + 8));
46959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
47059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
47159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
47259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
47359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
47459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
47559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
47659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10;  /* accumulate low bytes */
47759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
47859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
47959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
48059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
48159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
48259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
48359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        asm volatile("ldr  %0, [%4, #4]\n\t"
48459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "ldr  %1, [%4], %6\n\t"
48559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "ldr  %2, [%5, #4]\n\t"
48659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                     "ldr  %3, [%5], #16"
48759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong             : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
48859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                             : "r"(lx));
48959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
49059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x11 & x14 */
49159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = sad_4pixel(x11, x14, x9);
49259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
49359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /* process x12 & x10 */
49459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = sad_4pixel(x10, x12, x9);
49559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
49659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x10;  /* accumulate low bytes */
49759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
49859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
49959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x5 = x5 + x11;  /* accumulate low bytes */
50059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
50159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
50259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
50359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        /****************/
50459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x5 - (x4 << 8); /* extract low bytes */
50559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + x4;     /* add with high bytes */
50659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        x10 = x10 + (x10 << 16); /* add with lower half word */
50759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
50859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
50959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        {
51059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            if (--x8)
51159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            {
51259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong                goto LOOP_SAD0;
51359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong            }
51459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
51559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        }
51659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
51759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return ((uint32)x10 >> 16);
51859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
51959f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset3:
52059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
52159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset3(ref, blk, lx, dmin);
52259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
52359f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset2:
52459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
52559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset2(ref, blk, lx, dmin);
52659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
52759f566c4ec3dfc097ad8163523e522280b27e5c3James DongSadMBOffset1:
52859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
52959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong        return sad_mb_offset1(ref, blk, lx, dmin);
53059f566c4ec3dfc097ad8163523e522280b27e5c3James Dong    }
53159f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
53259f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#endif // OS
53359f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
53459f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#ifdef __cplusplus
53559f566c4ec3dfc097ad8163523e522280b27e5c3James Dong}
53659f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#endif
53759f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
53859f566c4ec3dfc097ad8163523e522280b27e5c3James Dong#endif // _SAD_INLINE_H_
53959f566c4ec3dfc097ad8163523e522280b27e5c3James Dong
540