/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
1829a84457aed4c45bc900998b5e11c03023264208James Dong
/* The GCC inline-asm version is intentionally not used, since it is
 * slightly slower than the plain C version on modern GCC versions. */
21f5af6314db25ff3bef9bd2eeba201bc6cc60805dMartin Storsjo#if !defined(__CC_ARM) /* Generic C version */
2229a84457aed4c45bc900998b5e11c03023264208James Dong
2329a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
2429a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
2529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
2629a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
2729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
2829a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
2929a84457aed4c45bc900998b5e11c03023264208James Dong#endif
3029a84457aed4c45bc900998b5e11c03023264208James Dong{
3129a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
3229a84457aed4c45bc900998b5e11c03023264208James Dong
3329a84457aed4c45bc900998b5e11c03023264208James Dong    //  x5 = (x4<<8) - x4;
3429a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
3529a84457aed4c45bc900998b5e11c03023264208James Dong    x6 = 0xFFFF00FF;
3629a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
3729a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= NUMBER; /* bic ref, ref, #3 */
3829a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= lx;
3929a84457aed4c45bc900998b5e11c03023264208James Dong    blk -= 16;
4029a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16;
4129a84457aed4c45bc900998b5e11c03023264208James Dong
4229a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
4329a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
4429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
4529a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
4629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
4729a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
4829a84457aed4c45bc900998b5e11c03023264208James Dong#endif
4929a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
5029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref += lx)); /* D C B A */
5129a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 4));    /* H G F E */
5229a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 8));    /* L K J I */
5329a84457aed4c45bc900998b5e11c03023264208James Dong
5429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
5529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
5629a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
5729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
5829a84457aed4c45bc900998b5e11c03023264208James Dong
5929a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk += 16));
6029a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 4));
6129a84457aed4c45bc900998b5e11c03023264208James Dong
6229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
6329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
6429a84457aed4c45bc900998b5e11c03023264208James Dong
6529a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
6629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
6729a84457aed4c45bc900998b5e11c03023264208James Dong
6829a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
6929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
7029a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
7129a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
7229a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
7329a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
7429a84457aed4c45bc900998b5e11c03023264208James Dong
7529a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
7629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref + 8)); /* D C B A */
7729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 12));   /* H G F E */
7829a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 16));   /* L K J I */
7929a84457aed4c45bc900998b5e11c03023264208James Dong
8029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
8129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
8229a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
8329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
8429a84457aed4c45bc900998b5e11c03023264208James Dong
8529a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk + 8));
8629a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 12));
8729a84457aed4c45bc900998b5e11c03023264208James Dong
8829a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
8929a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
9029a84457aed4c45bc900998b5e11c03023264208James Dong
9129a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
9229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
9329a84457aed4c45bc900998b5e11c03023264208James Dong
9429a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
9529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
9629a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
9729a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
9829a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
9929a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
10029a84457aed4c45bc900998b5e11c03023264208James Dong
10129a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
10229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
10329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
10429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
10529a84457aed4c45bc900998b5e11c03023264208James Dong
10629a84457aed4c45bc900998b5e11c03023264208James Dong    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
10729a84457aed4c45bc900998b5e11c03023264208James Dong    {
10829a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
10929a84457aed4c45bc900998b5e11c03023264208James Dong        {
11029a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
11129a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
11229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
11329a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD2;
11429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
11529a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD1;
11629a84457aed4c45bc900998b5e11c03023264208James Dong#endif
11729a84457aed4c45bc900998b5e11c03023264208James Dong        }
11829a84457aed4c45bc900998b5e11c03023264208James Dong
11929a84457aed4c45bc900998b5e11c03023264208James Dong    }
12029a84457aed4c45bc900998b5e11c03023264208James Dong
12129a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
12229a84457aed4c45bc900998b5e11c03023264208James Dong}
12329a84457aed4c45bc900998b5e11c03023264208James Dong
12429a84457aed4c45bc900998b5e11c03023264208James Dong#elif defined(__CC_ARM)  /* only work with arm v5 */
12529a84457aed4c45bc900998b5e11c03023264208James Dong
12629a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
12729a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
12829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
12929a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
13029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
13129a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
13229a84457aed4c45bc900998b5e11c03023264208James Dong#endif
13329a84457aed4c45bc900998b5e11c03023264208James Dong{
13429a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x9, x10, x11, x12, x14;
13529a84457aed4c45bc900998b5e11c03023264208James Dong
13629a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
13729a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
13829a84457aed4c45bc900998b5e11c03023264208James Dong
13929a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
14029a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x6, #0xff0000;
14129a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
14229a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
14329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
14429a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
14529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
14629a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
14729a84457aed4c45bc900998b5e11c03023264208James Dong#endif
14829a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      ref, ref, #3;
14929a84457aed4c45bc900998b5e11c03023264208James Dong    }
15029a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
15129a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
15229a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
15329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
15429a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
15529a84457aed4c45bc900998b5e11c03023264208James Dong
15629a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
15729a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
15829a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
15929a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
16029a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
16129a84457aed4c45bc900998b5e11c03023264208James Dong
16229a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk, #8];
16329a84457aed4c45bc900998b5e11c03023264208James Dong    }
16429a84457aed4c45bc900998b5e11c03023264208James Dong
16529a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
16629a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
16729a84457aed4c45bc900998b5e11c03023264208James Dong
16829a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
16929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
17029a84457aed4c45bc900998b5e11c03023264208James Dong
17129a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
17229a84457aed4c45bc900998b5e11c03023264208James Dong
17329a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
17429a84457aed4c45bc900998b5e11c03023264208James Dong        /****** process 8 pixels ******/
17529a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x11, [ref, #4];
17629a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [ref, #8];
17729a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x10, [ref], lx ;
17829a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x14, [blk, #4];
17929a84457aed4c45bc900998b5e11c03023264208James Dong
18029a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
18129a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
18229a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
18329a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
18429a84457aed4c45bc900998b5e11c03023264208James Dong
18529a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk], #16;
18629a84457aed4c45bc900998b5e11c03023264208James Dong    }
18729a84457aed4c45bc900998b5e11c03023264208James Dong
18829a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
18929a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
19029a84457aed4c45bc900998b5e11c03023264208James Dong
19129a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
19229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
19329a84457aed4c45bc900998b5e11c03023264208James Dong
19429a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
19529a84457aed4c45bc900998b5e11c03023264208James Dong
19629a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
19729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
19829a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
19929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
20029a84457aed4c45bc900998b5e11c03023264208James Dong
20129a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
20229a84457aed4c45bc900998b5e11c03023264208James Dong        RSBS     x11, dmin, x10, lsr #16
20329a84457aed4c45bc900998b5e11c03023264208James Dong        ADDLSS   x8, x8, #INC_X8
20429a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
20529a84457aed4c45bc900998b5e11c03023264208James Dong        BLS      LOOP_SAD3;
20629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
20729a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD2;
20829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
20929a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD1;
21029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
21129a84457aed4c45bc900998b5e11c03023264208James Dong    }
21229a84457aed4c45bc900998b5e11c03023264208James Dong
21329a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
21429a84457aed4c45bc900998b5e11c03023264208James Dong}
21529a84457aed4c45bc900998b5e11c03023264208James Dong
21629a84457aed4c45bc900998b5e11c03023264208James Dong#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */
21729a84457aed4c45bc900998b5e11c03023264208James Dong
21829a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
21929a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
22029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
22129a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
22229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
22329a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
22429a84457aed4c45bc900998b5e11c03023264208James Dong#endif
22529a84457aed4c45bc900998b5e11c03023264208James Dong{
22629a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
22729a84457aed4c45bc900998b5e11c03023264208James Dong
22829a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
22929a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
23029a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16; //<<===========*******
23129a84457aed4c45bc900998b5e11c03023264208James Dong
232ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile("MVN       %0, #0xFF0000": "=r"(x6));
23329a84457aed4c45bc900998b5e11c03023264208James Dong
23429a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
23529a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
23629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
23729a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
23829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
23929a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
24029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
2413fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo    __asm__ volatile("BIC  %0, %0, #3": "+r"(ref));
24229a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
24329a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
24429a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
24529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
24629a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
24729a84457aed4c45bc900998b5e11c03023264208James Dong
24829a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
249ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
250ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #8\n\t"
251ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #24\n\t"
252ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #8\n\t"
253ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #24"
2543fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
255ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
256ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
25729a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
258ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
259ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #16\n\t"
260ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #16\n\t"
261ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #16\n\t"
262ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #16"
2633fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
264ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
265ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
26629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
267ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
268ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #24\n\t"
269ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #8\n\t"
270ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #24\n\t"
271ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #8"
2723fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
273ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
274ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
27529a84457aed4c45bc900998b5e11c03023264208James Dong#endif
27629a84457aed4c45bc900998b5e11c03023264208James Dong
27729a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(blk + 8));
27829a84457aed4c45bc900998b5e11c03023264208James Dong
27929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
28029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
28129a84457aed4c45bc900998b5e11c03023264208James Dong
28229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
28329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
28429a84457aed4c45bc900998b5e11c03023264208James Dong
28529a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
28629a84457aed4c45bc900998b5e11c03023264208James Dong
28729a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
28829a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 4));
28929a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 8));
29029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)ref); ref += lx;
29129a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 4));
29229a84457aed4c45bc900998b5e11c03023264208James Dong
29329a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
294ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
295ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #8\n\t"
296ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #24\n\t"
297ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #8\n\t"
298ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #24"
2993fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
300ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
301ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
30229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
303ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
304ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #16\n\t"
305ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #16\n\t"
306ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #16\n\t"
307ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #16"
3083fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
309ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
310ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
31129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
312ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    __asm__ volatile(
313ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %0, %0, lsr #24\n\t"
314ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %0, %0, %1, lsl #8\n\t"
315ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "MVN   %1, %1, lsr #24\n\t"
316ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        "BIC   %1, %1, %2, lsl #8"
3173fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo        : "+r"(x10), "+r"(x11)
318ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo        : "r"(x12)
319ccde1257952d2c073e51ecba6180060570ffa41fMartin Storsjo    );
32029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
3213fdb405597f0e062a9bb8af20199c5e67f0f764cMartin Storsjo    __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));
32229a84457aed4c45bc900998b5e11c03023264208James Dong
32329a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
32429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
32529a84457aed4c45bc900998b5e11c03023264208James Dong
32629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
32729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
32829a84457aed4c45bc900998b5e11c03023264208James Dong
32929a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
33029a84457aed4c45bc900998b5e11c03023264208James Dong
33129a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
33229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
33329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
33429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
33529a84457aed4c45bc900998b5e11c03023264208James Dong
33629a84457aed4c45bc900998b5e11c03023264208James Dong    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
33729a84457aed4c45bc900998b5e11c03023264208James Dong    {
33829a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
33929a84457aed4c45bc900998b5e11c03023264208James Dong        {
34029a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
34129a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
34229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
343955585cca11173b07e2e7db3d636ee97b69b053bMartin Storsjo            goto         LOOP_SAD2;
34429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
345955585cca11173b07e2e7db3d636ee97b69b053bMartin Storsjo            goto         LOOP_SAD1;
34629a84457aed4c45bc900998b5e11c03023264208James Dong#endif
34729a84457aed4c45bc900998b5e11c03023264208James Dong        }
34829a84457aed4c45bc900998b5e11c03023264208James Dong
34929a84457aed4c45bc900998b5e11c03023264208James Dong    }
35029a84457aed4c45bc900998b5e11c03023264208James Dong
35129a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
35229a84457aed4c45bc900998b5e11c03023264208James Dong}
35329a84457aed4c45bc900998b5e11c03023264208James Dong
35429a84457aed4c45bc900998b5e11c03023264208James Dong#endif
35529a84457aed4c45bc900998b5e11c03023264208James Dong
356