sad_mb_offset.h revision f5af6314db25ff3bef9bd2eeba201bc6cc60805d
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
1829a84457aed4c45bc900998b5e11c03023264208James Dong
/* Intentionally not using the gcc asm version, since it (if fixed so
 * as to not crash - the current register constraints are faulty) is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */
2329a84457aed4c45bc900998b5e11c03023264208James Dong
2429a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
2529a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
2629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
2729a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
2829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
2929a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
3029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
3129a84457aed4c45bc900998b5e11c03023264208James Dong{
3229a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
3329a84457aed4c45bc900998b5e11c03023264208James Dong
3429a84457aed4c45bc900998b5e11c03023264208James Dong    //  x5 = (x4<<8) - x4;
3529a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
3629a84457aed4c45bc900998b5e11c03023264208James Dong    x6 = 0xFFFF00FF;
3729a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
3829a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= NUMBER; /* bic ref, ref, #3 */
3929a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= lx;
4029a84457aed4c45bc900998b5e11c03023264208James Dong    blk -= 16;
4129a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16;
4229a84457aed4c45bc900998b5e11c03023264208James Dong
4329a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
4429a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
4529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
4629a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
4729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
4829a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
4929a84457aed4c45bc900998b5e11c03023264208James Dong#endif
5029a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
5129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref += lx)); /* D C B A */
5229a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 4));    /* H G F E */
5329a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 8));    /* L K J I */
5429a84457aed4c45bc900998b5e11c03023264208James Dong
5529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
5629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
5729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
5829a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
5929a84457aed4c45bc900998b5e11c03023264208James Dong
6029a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk += 16));
6129a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 4));
6229a84457aed4c45bc900998b5e11c03023264208James Dong
6329a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
6429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
6529a84457aed4c45bc900998b5e11c03023264208James Dong
6629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
6729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
6829a84457aed4c45bc900998b5e11c03023264208James Dong
6929a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
7029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
7129a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
7229a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
7329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
7429a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
7529a84457aed4c45bc900998b5e11c03023264208James Dong
7629a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
7729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref + 8)); /* D C B A */
7829a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 12));   /* H G F E */
7929a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 16));   /* L K J I */
8029a84457aed4c45bc900998b5e11c03023264208James Dong
8129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
8229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
8329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
8429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
8529a84457aed4c45bc900998b5e11c03023264208James Dong
8629a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk + 8));
8729a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 12));
8829a84457aed4c45bc900998b5e11c03023264208James Dong
8929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
9029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
9129a84457aed4c45bc900998b5e11c03023264208James Dong
9229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
9329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
9429a84457aed4c45bc900998b5e11c03023264208James Dong
9529a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
9629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
9729a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
9829a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
9929a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
10029a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
10129a84457aed4c45bc900998b5e11c03023264208James Dong
10229a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
10329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
10429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
10529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
10629a84457aed4c45bc900998b5e11c03023264208James Dong
10729a84457aed4c45bc900998b5e11c03023264208James Dong    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
10829a84457aed4c45bc900998b5e11c03023264208James Dong    {
10929a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
11029a84457aed4c45bc900998b5e11c03023264208James Dong        {
11129a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
11229a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
11329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
11429a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD2;
11529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
11629a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD1;
11729a84457aed4c45bc900998b5e11c03023264208James Dong#endif
11829a84457aed4c45bc900998b5e11c03023264208James Dong        }
11929a84457aed4c45bc900998b5e11c03023264208James Dong
12029a84457aed4c45bc900998b5e11c03023264208James Dong    }
12129a84457aed4c45bc900998b5e11c03023264208James Dong
12229a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
12329a84457aed4c45bc900998b5e11c03023264208James Dong}
12429a84457aed4c45bc900998b5e11c03023264208James Dong
#elif defined(__CC_ARM)  /* only works with ARMv5 */
12629a84457aed4c45bc900998b5e11c03023264208James Dong
12729a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
12829a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
12929a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
13029a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
13129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
13229a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
13329a84457aed4c45bc900998b5e11c03023264208James Dong#endif
13429a84457aed4c45bc900998b5e11c03023264208James Dong{
13529a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x9, x10, x11, x12, x14;
13629a84457aed4c45bc900998b5e11c03023264208James Dong
13729a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
13829a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
13929a84457aed4c45bc900998b5e11c03023264208James Dong
14029a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
14129a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x6, #0xff0000;
14229a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
14329a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
14429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
14529a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
14629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
14729a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
14829a84457aed4c45bc900998b5e11c03023264208James Dong#endif
14929a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      ref, ref, #3;
15029a84457aed4c45bc900998b5e11c03023264208James Dong    }
15129a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
15229a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
15329a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
15429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
15529a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
15629a84457aed4c45bc900998b5e11c03023264208James Dong
15729a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
15829a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
15929a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
16029a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
16129a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
16229a84457aed4c45bc900998b5e11c03023264208James Dong
16329a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk, #8];
16429a84457aed4c45bc900998b5e11c03023264208James Dong    }
16529a84457aed4c45bc900998b5e11c03023264208James Dong
16629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
16729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
16829a84457aed4c45bc900998b5e11c03023264208James Dong
16929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
17029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
17129a84457aed4c45bc900998b5e11c03023264208James Dong
17229a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
17329a84457aed4c45bc900998b5e11c03023264208James Dong
17429a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
17529a84457aed4c45bc900998b5e11c03023264208James Dong        /****** process 8 pixels ******/
17629a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x11, [ref, #4];
17729a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [ref, #8];
17829a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x10, [ref], lx ;
17929a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x14, [blk, #4];
18029a84457aed4c45bc900998b5e11c03023264208James Dong
18129a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
18229a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
18329a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
18429a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
18529a84457aed4c45bc900998b5e11c03023264208James Dong
18629a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk], #16;
18729a84457aed4c45bc900998b5e11c03023264208James Dong    }
18829a84457aed4c45bc900998b5e11c03023264208James Dong
18929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
19029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
19129a84457aed4c45bc900998b5e11c03023264208James Dong
19229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
19329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
19429a84457aed4c45bc900998b5e11c03023264208James Dong
19529a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
19629a84457aed4c45bc900998b5e11c03023264208James Dong
19729a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
19829a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
19929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
20029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
20129a84457aed4c45bc900998b5e11c03023264208James Dong
20229a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
20329a84457aed4c45bc900998b5e11c03023264208James Dong        RSBS     x11, dmin, x10, lsr #16
20429a84457aed4c45bc900998b5e11c03023264208James Dong        ADDLSS   x8, x8, #INC_X8
20529a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
20629a84457aed4c45bc900998b5e11c03023264208James Dong        BLS      LOOP_SAD3;
20729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
20829a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD2;
20929a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
21029a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD1;
21129a84457aed4c45bc900998b5e11c03023264208James Dong#endif
21229a84457aed4c45bc900998b5e11c03023264208James Dong    }
21329a84457aed4c45bc900998b5e11c03023264208James Dong
21429a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
21529a84457aed4c45bc900998b5e11c03023264208James Dong}
21629a84457aed4c45bc900998b5e11c03023264208James Dong
#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU compiler */
21829a84457aed4c45bc900998b5e11c03023264208James Dong
21929a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
22029a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
22129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
22229a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
22329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
22429a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
22529a84457aed4c45bc900998b5e11c03023264208James Dong#endif
22629a84457aed4c45bc900998b5e11c03023264208James Dong{
22729a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
22829a84457aed4c45bc900998b5e11c03023264208James Dong
22929a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
23029a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
23129a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16; //<<===========*******
23229a84457aed4c45bc900998b5e11c03023264208James Dong
23329a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN	%0, #0xFF0000": "=r"(x6));
23429a84457aed4c45bc900998b5e11c03023264208James Dong
23529a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
23629a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
23729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
23829a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
23929a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
24029a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
24129a84457aed4c45bc900998b5e11c03023264208James Dong#endif
24229a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("BIC  %0, %0, #3": "=r"(ref));
24329a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
24429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
24529a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
24629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
24729a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
24829a84457aed4c45bc900998b5e11c03023264208James Dong
24929a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
25029a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "=&r"(x10), "=&r"(x11): "r"(x12));
25129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
25229a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "=&r"(x10), "=&r"(x11): "r"(x12));
25329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
25429a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "=&r"(x10), "=&r"(x11): "r"(x12));
25529a84457aed4c45bc900998b5e11c03023264208James Dong#endif
25629a84457aed4c45bc900998b5e11c03023264208James Dong
25729a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(blk + 8));
25829a84457aed4c45bc900998b5e11c03023264208James Dong
25929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
26029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
26129a84457aed4c45bc900998b5e11c03023264208James Dong
26229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
26329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
26429a84457aed4c45bc900998b5e11c03023264208James Dong
26529a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
26629a84457aed4c45bc900998b5e11c03023264208James Dong
26729a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
26829a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 4));
26929a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 8));
27029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)ref); ref += lx;
27129a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 4));
27229a84457aed4c45bc900998b5e11c03023264208James Dong
27329a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
27429a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "=&r"(x10), "=&r"(x11): "r"(x12));
27529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
27629a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "=&r"(x10), "=&r"(x11): "r"(x12));
27729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
27829a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "=&r"(x10), "=&r"(x11): "r"(x12));
27929a84457aed4c45bc900998b5e11c03023264208James Dong#endif
28029a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "=r"(blk));
28129a84457aed4c45bc900998b5e11c03023264208James Dong
28229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
28329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
28429a84457aed4c45bc900998b5e11c03023264208James Dong
28529a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
28629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
28729a84457aed4c45bc900998b5e11c03023264208James Dong
28829a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
28929a84457aed4c45bc900998b5e11c03023264208James Dong
29029a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
29129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
29229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
29329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
29429a84457aed4c45bc900998b5e11c03023264208James Dong
29529a84457aed4c45bc900998b5e11c03023264208James Dong    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
29629a84457aed4c45bc900998b5e11c03023264208James Dong    {
29729a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
29829a84457aed4c45bc900998b5e11c03023264208James Dong        {
29929a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
30029a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
30129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
30229a84457aed4c45bc900998b5e11c03023264208James Donggoto         LOOP_SAD2;
30329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
30429a84457aed4c45bc900998b5e11c03023264208James Donggoto         LOOP_SAD1;
30529a84457aed4c45bc900998b5e11c03023264208James Dong#endif
30629a84457aed4c45bc900998b5e11c03023264208James Dong        }
30729a84457aed4c45bc900998b5e11c03023264208James Dong
30829a84457aed4c45bc900998b5e11c03023264208James Dong    }
30929a84457aed4c45bc900998b5e11c03023264208James Dong
31029a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
31129a84457aed4c45bc900998b5e11c03023264208James Dong}
31229a84457aed4c45bc900998b5e11c03023264208James Dong
#endif
31429a84457aed4c45bc900998b5e11c03023264208James Dong
315