/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* Intentionally not using the gcc asm version, since it is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

    /* Add the absolute difference |tmp - tmp2| to the running SAD. */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }
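    /* sad_4pixel() computes the byte-wise absolute differences of two packed
     * 32-bit words (four pixels per call) without unpacking the bytes; the
     * mask argument is always 0x80808080.  The word-level subtraction below
     * can leave individual byte lanes holding a negative (two's-complement)
     * value; x7 marks those lanes, (x7 << 8) - x7 expands each mark into a
     * 0xFF/carry correction, and the final add/XOR pair flips the lane back
     * to its absolute value (for example, a lane holding 0xFB, i.e. -5, is
     * corrected to 0x05, since -d == ~(d - 1)).  The result is four packed
     * per-pixel absolute differences, which the caller accumulates. */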
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^ (x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }

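/* sad_mb_offset.h is included three times below.  Each inclusion, with a
 * different NUMBER/SHIFT pair, expands into one function -- sad_mb_offset3,
 * sad_mb_offset2 and sad_mb_offset1 -- handling a reference pointer that is
 * misaligned by 3, 2 or 1 bytes; SHIFT is the corresponding bit offset used
 * to re-align the loaded words. */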
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


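    /* simd_sad_mb() returns the 16x16 SAD between a candidate block in the
     * reference frame (ref, row stride lx) and the current macroblock (blk,
     * stored contiguously with a row stride of 16, as the pointer arithmetic
     * below implies).  dmin is the best SAD found so far: the row loop stops
     * early as soon as the running SAD exceeds it.  Unaligned ref pointers
     * are dispatched to the generated sad_mb_offsetN() variants. */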
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

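        /* Accumulation scheme for the row loop below: x5 sums the packed
         * 4-byte difference words directly, while x4 separately sums the two
         * high bytes of every word ((word & 0xFF00FF00) >> 8).  At the end of
         * each row, x5 - (x4 << 8) cancels the high-byte contributions (and
         * the carries they spilled into neighbouring lanes), leaving two
         * 16-bit partial sums in x10; adding x4 and then folding with
         * (x10 << 16) puts the complete running SAD in the upper halfword of
         * x10, which is compared against dmin once per row. */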
LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
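    /* Typical use (illustrative sketch only; the identifiers below are not
     * from this codebase): during motion estimation the caller keeps the best
     * SAD found so far and passes it in as dmin, so clearly worse candidates
     * are rejected after only a few rows:
     *
     *     sad = simd_sad_mb(ref_frame + offset, cur_mb, best_sad, frame_width);
     *     if (sad < best_sad) best_sad = sad;
     */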

#elif defined(__CC_ARM)  /* only works with ARM v5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }
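    /* Note: sad_4pixelN() is only called from the code generated by
     * sad_mb_offset.h.  Judging from the ADDS/rrx sequence, it expects its
     * second operand to arrive bitwise-complemented, so the byte-wise
     * difference is formed by addition rather than subtraction. */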

#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
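/* sum_accumulate is the accumulation step used by the functions generated
 * from sad_mb_offset.h; x5 and x4 play the same low-byte/high-byte
 * accumulator roles as in simd_sad_mb below.  The SBC/BIC forms depend on
 * flag state and operand preparation done by the surrounding generated
 * code. */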


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            /* shift the two low address bits of ref into the flags:
             * HI -> offset 3, CS -> offset 2, MI -> offset 1, else aligned */
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;    /* x6 = 0xFFFF00FF */
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm__ volatile(
            "rsbs       %1, %1, %2\n\t"
            "rsbmi      %1, %1, #0\n\t"
            "add        %0, %0, %1"
            : "+r"(sad), "+r"(tmp)
            : "r"(tmp2)
        );
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "SUBS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "AND        %1, %3, %1, lsr #1\n\t"
            "ORRCC      %1, %1, #0x80000000\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "ADD        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "ADDS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "ANDS       %1, %3, %1, rrx\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "SUB        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

#define sum_accumulate  __asm__ volatile(              \
    "SBC   %0, %0, %1\n\t"                             \
    "BIC   %1, %4, %1\n\t"                             \
    "ADD   %2, %2, %1, lsr #8\n\t"                     \
    "SBC   %0, %0, %3\n\t"                             \
    "BIC   %3, %4, %3\n\t"                             \
    "ADD   %2, %2, %3, lsr #8"                         \
    : "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11)     \
    : "r" (x6)                                         \
    );

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;
        __asm__ volatile("MVN   %0, #0xFF00": "=r"(x6)); /* x6 = 0xFFFF00FF */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        __asm__ volatile("LDR   %0, [%1], %2": "=&r"(x10), "+r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
        __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_
