sad_mb_offset.h revision 29a84457aed4c45bc900998b5e11c03023264208
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
1829a84457aed4c45bc900998b5e11c03023264208James Dong
1929a84457aed4c45bc900998b5e11c03023264208James Dong#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */
2029a84457aed4c45bc900998b5e11c03023264208James Dong
2129a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
2229a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
2329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
2429a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
2529a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
2629a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
2729a84457aed4c45bc900998b5e11c03023264208James Dong#endif
2829a84457aed4c45bc900998b5e11c03023264208James Dong{
2929a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
3029a84457aed4c45bc900998b5e11c03023264208James Dong
3129a84457aed4c45bc900998b5e11c03023264208James Dong    //  x5 = (x4<<8) - x4;
3229a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
3329a84457aed4c45bc900998b5e11c03023264208James Dong    x6 = 0xFFFF00FF;
3429a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
3529a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= NUMBER; /* bic ref, ref, #3 */
3629a84457aed4c45bc900998b5e11c03023264208James Dong    ref -= lx;
3729a84457aed4c45bc900998b5e11c03023264208James Dong    blk -= 16;
3829a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16;
3929a84457aed4c45bc900998b5e11c03023264208James Dong
4029a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
4129a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
4229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
4329a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
4429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
4529a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
4629a84457aed4c45bc900998b5e11c03023264208James Dong#endif
4729a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
4829a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref += lx)); /* D C B A */
4929a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 4));    /* H G F E */
5029a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 8));    /* L K J I */
5129a84457aed4c45bc900998b5e11c03023264208James Dong
5229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
5329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
5429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
5529a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
5629a84457aed4c45bc900998b5e11c03023264208James Dong
5729a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk += 16));
5829a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 4));
5929a84457aed4c45bc900998b5e11c03023264208James Dong
6029a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
6129a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
6229a84457aed4c45bc900998b5e11c03023264208James Dong
6329a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
6429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
6529a84457aed4c45bc900998b5e11c03023264208James Dong
6629a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
6729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
6829a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
6929a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
7029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
7129a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
7229a84457aed4c45bc900998b5e11c03023264208James Dong
7329a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
7429a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((uint32*)(ref + 8)); /* D C B A */
7529a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((uint32*)(ref + 12));   /* H G F E */
7629a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(ref + 16));   /* L K J I */
7729a84457aed4c45bc900998b5e11c03023264208James Dong
7829a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
7929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
8029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
8129a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
8229a84457aed4c45bc900998b5e11c03023264208James Dong
8329a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((uint32*)(blk + 8));
8429a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((uint32*)(blk + 12));
8529a84457aed4c45bc900998b5e11c03023264208James Dong
8629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
8729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixel(x11, x14, x9);
8829a84457aed4c45bc900998b5e11c03023264208James Dong
8929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
9029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixel(x10, x12, x9);
9129a84457aed4c45bc900998b5e11c03023264208James Dong
9229a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x10; /* accumulate low bytes */
9329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
9429a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
9529a84457aed4c45bc900998b5e11c03023264208James Dong    x5 = x5 + x11;  /* accumulate low bytes */
9629a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
9729a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
9829a84457aed4c45bc900998b5e11c03023264208James Dong
9929a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
10029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
10129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
10229a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
10329a84457aed4c45bc900998b5e11c03023264208James Dong
10429a84457aed4c45bc900998b5e11c03023264208James Dong    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
10529a84457aed4c45bc900998b5e11c03023264208James Dong    {
10629a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
10729a84457aed4c45bc900998b5e11c03023264208James Dong        {
10829a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
10929a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
11029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
11129a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD2;
11229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
11329a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD1;
11429a84457aed4c45bc900998b5e11c03023264208James Dong#endif
11529a84457aed4c45bc900998b5e11c03023264208James Dong        }
11629a84457aed4c45bc900998b5e11c03023264208James Dong
11729a84457aed4c45bc900998b5e11c03023264208James Dong    }
11829a84457aed4c45bc900998b5e11c03023264208James Dong
11929a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
12029a84457aed4c45bc900998b5e11c03023264208James Dong}
12129a84457aed4c45bc900998b5e11c03023264208James Dong
12229a84457aed4c45bc900998b5e11c03023264208James Dong#elif defined(__CC_ARM)  /* only work with arm v5 */
12329a84457aed4c45bc900998b5e11c03023264208James Dong
12429a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
12529a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
12629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
12729a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
12829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
12929a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
13029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
13129a84457aed4c45bc900998b5e11c03023264208James Dong{
13229a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x9, x10, x11, x12, x14;
13329a84457aed4c45bc900998b5e11c03023264208James Dong
13429a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
13529a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
13629a84457aed4c45bc900998b5e11c03023264208James Dong
13729a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
13829a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x6, #0xff0000;
13929a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
14029a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
14129a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
14229a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
14329a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
14429a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
14529a84457aed4c45bc900998b5e11c03023264208James Dong#endif
14629a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      ref, ref, #3;
14729a84457aed4c45bc900998b5e11c03023264208James Dong    }
14829a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
14929a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
15029a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
15129a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
15229a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
15329a84457aed4c45bc900998b5e11c03023264208James Dong
15429a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
15529a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
15629a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
15729a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
15829a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
15929a84457aed4c45bc900998b5e11c03023264208James Dong
16029a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk, #8];
16129a84457aed4c45bc900998b5e11c03023264208James Dong    }
16229a84457aed4c45bc900998b5e11c03023264208James Dong
16329a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
16429a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
16529a84457aed4c45bc900998b5e11c03023264208James Dong
16629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
16729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
16829a84457aed4c45bc900998b5e11c03023264208James Dong
16929a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
17029a84457aed4c45bc900998b5e11c03023264208James Dong
17129a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
17229a84457aed4c45bc900998b5e11c03023264208James Dong        /****** process 8 pixels ******/
17329a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x11, [ref, #4];
17429a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [ref, #8];
17529a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x10, [ref], lx ;
17629a84457aed4c45bc900998b5e11c03023264208James Dong        LDR  x14, [blk, #4];
17729a84457aed4c45bc900998b5e11c03023264208James Dong
17829a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x10, x10, lsr #SHIFT;
17929a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x10, x10, x11, lsl #(32-SHIFT);
18029a84457aed4c45bc900998b5e11c03023264208James Dong        MVN      x11, x11, lsr #SHIFT;
18129a84457aed4c45bc900998b5e11c03023264208James Dong        BIC      x11, x11, x12, lsl #(32-SHIFT);
18229a84457aed4c45bc900998b5e11c03023264208James Dong
18329a84457aed4c45bc900998b5e11c03023264208James Dong        LDR      x12, [blk], #16;
18429a84457aed4c45bc900998b5e11c03023264208James Dong    }
18529a84457aed4c45bc900998b5e11c03023264208James Dong
18629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
18729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
18829a84457aed4c45bc900998b5e11c03023264208James Dong
18929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
19029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
19129a84457aed4c45bc900998b5e11c03023264208James Dong
19229a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
19329a84457aed4c45bc900998b5e11c03023264208James Dong
19429a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
19529a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
19629a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
19729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
19829a84457aed4c45bc900998b5e11c03023264208James Dong
19929a84457aed4c45bc900998b5e11c03023264208James Dong    __asm{
20029a84457aed4c45bc900998b5e11c03023264208James Dong        RSBS     x11, dmin, x10, lsr #16
20129a84457aed4c45bc900998b5e11c03023264208James Dong        ADDLSS   x8, x8, #INC_X8
20229a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
20329a84457aed4c45bc900998b5e11c03023264208James Dong        BLS      LOOP_SAD3;
20429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
20529a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD2;
20629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
20729a84457aed4c45bc900998b5e11c03023264208James DongBLS      LOOP_SAD1;
20829a84457aed4c45bc900998b5e11c03023264208James Dong#endif
20929a84457aed4c45bc900998b5e11c03023264208James Dong    }
21029a84457aed4c45bc900998b5e11c03023264208James Dong
21129a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
21229a84457aed4c45bc900998b5e11c03023264208James Dong}
21329a84457aed4c45bc900998b5e11c03023264208James Dong
21429a84457aed4c45bc900998b5e11c03023264208James Dong#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */
21529a84457aed4c45bc900998b5e11c03023264208James Dong
21629a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
21729a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
21829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
21929a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
22029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
22129a84457aed4c45bc900998b5e11c03023264208James Dong__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
22229a84457aed4c45bc900998b5e11c03023264208James Dong#endif
22329a84457aed4c45bc900998b5e11c03023264208James Dong{
22429a84457aed4c45bc900998b5e11c03023264208James Dong    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
22529a84457aed4c45bc900998b5e11c03023264208James Dong
22629a84457aed4c45bc900998b5e11c03023264208James Dong    x9 = 0x80808080; /* const. */
22729a84457aed4c45bc900998b5e11c03023264208James Dong    x4 = x5 = 0;
22829a84457aed4c45bc900998b5e11c03023264208James Dong    x8 = 16; //<<===========*******
22929a84457aed4c45bc900998b5e11c03023264208James Dong
23029a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN	%0, #0xFF0000": "=r"(x6));
23129a84457aed4c45bc900998b5e11c03023264208James Dong
23229a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
23329a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD3:
23429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
23529a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD2:
23629a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
23729a84457aed4c45bc900998b5e11c03023264208James DongLOOP_SAD1:
23829a84457aed4c45bc900998b5e11c03023264208James Dong#endif
23929a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("BIC  %0, %0, #3": "=r"(ref));
24029a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
24129a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 12));
24229a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 16));
24329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)(ref + 8));
24429a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 12));
24529a84457aed4c45bc900998b5e11c03023264208James Dong
24629a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
24729a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "=&r"(x10), "=&r"(x11): "r"(x12));
24829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
24929a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "=&r"(x10), "=&r"(x11): "r"(x12));
25029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
25129a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "=&r"(x10), "=&r"(x11): "r"(x12));
25229a84457aed4c45bc900998b5e11c03023264208James Dong#endif
25329a84457aed4c45bc900998b5e11c03023264208James Dong
25429a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(blk + 8));
25529a84457aed4c45bc900998b5e11c03023264208James Dong
25629a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
25729a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
25829a84457aed4c45bc900998b5e11c03023264208James Dong
25929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
26029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
26129a84457aed4c45bc900998b5e11c03023264208James Dong
26229a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
26329a84457aed4c45bc900998b5e11c03023264208James Dong
26429a84457aed4c45bc900998b5e11c03023264208James Dong    /****** process 8 pixels ******/
26529a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = *((int32*)(ref + 4));
26629a84457aed4c45bc900998b5e11c03023264208James Dong    x12 = *((int32*)(ref + 8));
26729a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = *((int32*)ref); ref += lx;
26829a84457aed4c45bc900998b5e11c03023264208James Dong    x14 = *((int32*)(blk + 4));
26929a84457aed4c45bc900998b5e11c03023264208James Dong
27029a84457aed4c45bc900998b5e11c03023264208James Dong#if (SHIFT==8)
27129a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1,lsl #24\n\tMVN   %1, %1,lsr #8\n\tBIC   %1, %1, %2,lsl #24": "=&r"(x10), "=&r"(x11): "r"(x12));
27229a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==16)
27329a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1,lsl #16\n\tMVN   %1, %1,lsr #16\n\tBIC   %1, %1, %2,lsl #16": "=&r"(x10), "=&r"(x11): "r"(x12));
27429a84457aed4c45bc900998b5e11c03023264208James Dong#elif (SHIFT==24)
27529a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1,lsl #8\n\tMVN   %1, %1,lsr #24\n\tBIC   %1, %1, %2,lsl #8": "=&r"(x10), "=&r"(x11): "r"(x12));
27629a84457aed4c45bc900998b5e11c03023264208James Dong#endif
27729a84457aed4c45bc900998b5e11c03023264208James Dong__asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "=r"(blk));
27829a84457aed4c45bc900998b5e11c03023264208James Dong
27929a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x11 & x14 */
28029a84457aed4c45bc900998b5e11c03023264208James Dong    x11 = sad_4pixelN(x11, x14, x9);
28129a84457aed4c45bc900998b5e11c03023264208James Dong
28229a84457aed4c45bc900998b5e11c03023264208James Dong    /* process x12 & x10 */
28329a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = sad_4pixelN(x10, x12, x9);
28429a84457aed4c45bc900998b5e11c03023264208James Dong
28529a84457aed4c45bc900998b5e11c03023264208James Dong    sum_accumulate;
28629a84457aed4c45bc900998b5e11c03023264208James Dong
28729a84457aed4c45bc900998b5e11c03023264208James Dong    /****************/
28829a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x5 - (x4 << 8); /* extract low bytes */
28929a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + x4;     /* add with high bytes */
29029a84457aed4c45bc900998b5e11c03023264208James Dong    x10 = x10 + (x10 << 16); /* add with lower half word */
29129a84457aed4c45bc900998b5e11c03023264208James Dong
29229a84457aed4c45bc900998b5e11c03023264208James Dong    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
29329a84457aed4c45bc900998b5e11c03023264208James Dong    {
29429a84457aed4c45bc900998b5e11c03023264208James Dong        if (--x8)
29529a84457aed4c45bc900998b5e11c03023264208James Dong        {
29629a84457aed4c45bc900998b5e11c03023264208James Dong#if (NUMBER==3)
29729a84457aed4c45bc900998b5e11c03023264208James Dong            goto         LOOP_SAD3;
29829a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==2)
29929a84457aed4c45bc900998b5e11c03023264208James Donggoto         LOOP_SAD2;
30029a84457aed4c45bc900998b5e11c03023264208James Dong#elif (NUMBER==1)
30129a84457aed4c45bc900998b5e11c03023264208James Donggoto         LOOP_SAD1;
30229a84457aed4c45bc900998b5e11c03023264208James Dong#endif
30329a84457aed4c45bc900998b5e11c03023264208James Dong        }
30429a84457aed4c45bc900998b5e11c03023264208James Dong
30529a84457aed4c45bc900998b5e11c03023264208James Dong    }
30629a84457aed4c45bc900998b5e11c03023264208James Dong
30729a84457aed4c45bc900998b5e11c03023264208James Dong    return ((uint32)x10 >> 16);
30829a84457aed4c45bc900998b5e11c03023264208James Dong}
30929a84457aed4c45bc900998b5e11c03023264208James Dong
31029a84457aed4c45bc900998b5e11c03023264208James Dong#endif
31129a84457aed4c45bc900998b5e11c03023264208James Dong
312