sad_inline.h revision ccde1257952d2c073e51ecba6180060570ffa41f
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* The gcc inline-asm version at the bottom of this file is intentionally not
 * used: its register constraints are faulty (it crashes as written), and even
 * when fixed it is slightly slower than the plain C version on modern GCC.
 * Because the first branch below covers every compiler except ARM ADS/RVCT
 * (__CC_ARM), the gcc asm branch is never selected. */
#if !defined(__CC_ARM) /* Generic C version */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }
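
    /* SUB_SAD() accumulates the absolute difference of its two pixel
     * arguments into sad; e.g. SUB_SAD(sad, 10, 3) and SUB_SAD(sad, 3, 10)
     * both return sad + 7. */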

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^ (x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }
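
    /* Illustrative scalar equivalent of sad_4pixel() (not part of the original
     * header, compiled out): src1 and src2 are treated as four packed unsigned
     * bytes, and the result holds the per-byte absolute differences packed back
     * into one word; the bit tricks above compute the same thing without
     * unpacking.  The helper name below is hypothetical. */
#if 0
    __inline int32 sad_4pixel_ref(int32 src1, int32 src2)
    {
        int32 i, d, result = 0;

        for (i = 0; i < 4; i++)
        {
            int32 a = ((uint32)src1 >> (8 * i)) & 0xFF;   /* byte lane i of src1 */
            int32 b = ((uint32)src2 >> (8 * i)) & 0xFF;   /* byte lane i of src2 */
            d = (a > b) ? (a - b) : (b - a);              /* |a - b| */
            result |= d << (8 * i);                       /* pack back into lane i */
        }
        return result;
    }
#endif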

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
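
    /* "sad_mb_offset.h" is included three times above with different
     * NUMBER/SHIFT settings to generate the sad_mb_offset1/2/3() routines
     * that simd_sad_mb() below dispatches to when ref is 1, 2 or 3 bytes
     * past a word boundary. */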


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;
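
        /* ref and blk are pre-decremented above because the loop below
         * reloads them with (ref += lx) and (blk += 16); x8 counts the
         * 16 rows of the macroblock. */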

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */
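
        /* Note on the fold above: for the rows processed so far,
         * x5 = S0 + (S1<<8) + (S2<<16) + (S3<<24) (mod 2^32) and
         * x4 = S1 + (S3<<16), where Sj is the running sum of byte lane j of
         * the packed |diff| words.  Subtracting (x4<<8) strips the odd lanes
         * from x5, adding x4 merges them into the even-lane accumulators, and
         * the final (x10 << 16) fold leaves the partial SAD in the top half
         * word, which the return below extracts. */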

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
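
    /* Hypothetical usage sketch (not part of the original header, compiled
     * out): a motion-estimation caller typically evaluates several candidate
     * reference blocks against the current macroblock and keeps the smallest
     * SAD, passing the best value so far as dmin so the loop above can stop
     * early once a candidate is already worse.  Names below are illustrative;
     * cur must hold the current macroblock with a 16-byte row stride, and
     * ref_row points into the reference frame with row stride lx. */
#if 0
    __inline int32 best_sad_along_row(uint8 *cur, uint8 *ref_row, int num_cand, int lx)
    {
        int32 dmin = 65536;   /* larger than any possible 16x16 SAD (16*16*255) */
        int32 sad;
        int i;

        for (i = 0; i < num_cand; i++)
        {
            sad = simd_sad_mb(ref_row + i, cur, dmin, lx);
            if (sad < dmin)
            {
                dmin = sad;   /* new best candidate */
            }
        }
        return dmin;
    }
#endif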

#elif defined(__CC_ARM)  /* only works with ARMv5 (ARM ADS/RVCT compiler) */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */
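
/* sum_accumulate is not used directly in this file; it is intended for the
 * sad_mb_offsetN() bodies generated by the includes of "sad_mb_offset.h"
 * below, which share the same x4/x5 split-accumulation scheme as
 * simd_sad_mb(). */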


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm__ volatile(
            "rsbs       %1, %1, %2\n\t"
            "rsbmi      %1, %1, #0\n\t"
            "add        %0, %0, %1"
            : "=r"(sad)
            : "r"(tmp), "r"(tmp2)
        );
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "SUBS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "AND        %1, %3, %1, lsr #1\n\t"
            "ORRCC      %1, %1, #0x80000000\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "ADD        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "=r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR        %1, %2, %0\n\t"
            "ADDS       %0, %2, %0\n\t"
            "EOR        %1, %1, %0\n\t"
            "ANDS       %1, %3, %1, rrx\n\t"
            "RSB        %1, %1, %1, lsl #8\n\t"
            "SUB        %0, %0, %1, asr #7\n\t"
            "EOR        %0, %0, %1, asr #7"
            : "=r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

#define sum_accumulate  __asm__ volatile(              \
    "SBC   %0, %0, %1\n\t"                             \
    "BIC   %1, %4, %1\n\t"                             \
    "ADD   %2, %2, %1, lsr #8\n\t"                     \
    "SBC   %0, %0, %3\n\t"                             \
    "BIC   %3, %4, %3\n\t"                             \
    "ADD   %2, %2, %3, lsr #8"                         \
    : "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11) \
    : "r" (x6)                                         \
    );

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;

        __asm__ volatile("MVN   %0, #0xFF00": "=r"(x6));   /* x6 = 0xFFFF00FF */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        __asm__ volatile("LDR   %0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
        __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "=r"(blk));
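        /* Note: as the warning at the top of this file says, the constraints in
         * this branch are faulty; in the two LDR statements above, ref and blk
         * are declared as output-only operands ("=r") even though their incoming
         * values are needed, so they would have to become read-write ("+r")
         * operands if this path were ever enabled. */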

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_