sad_inline.h revision f5af6314db25ff3bef9bd2eeba201bc6cc60805d
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* Intentionally not using the gcc asm version: its current register
 * constraints are faulty (it crashes as written), and even when fixed
 * it is slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

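    /* Accumulate the absolute difference |tmp - tmp2| into the running SAD. */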
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^(x7 >> 7);   /* take absolute value of negative byte */

        return src1;
    }
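
    /* sad_4pixel() treats each 32-bit word as four packed 8-bit pixels
     * (SIMD-within-a-register) and returns a word whose bytes hold the
     * per-byte absolute differences |src1[i] - src2[i]|, i = 0..3; for
     * example src1 = 0x00000100, src2 = 0x00000001 gives 0x00000101.
     * A single 32-bit subtraction is used, and the x7/mask arithmetic
     * (mask = 0x80808080) finds the bytes whose difference came out
     * negative and flips them to their absolute value, as the inline
     * comments above describe.
     *
     * A straightforward per-byte equivalent, shown here only as an
     * illustrative sketch (sad_4pixel_ref is not part of this file):
     *
     *     static uint32 sad_4pixel_ref(uint32 a, uint32 b)
     *     {
     *         uint32 r = 0;
     *         int i;
     *         for (i = 0; i < 4; i++)
     *         {
     *             int d = (int)((a >> (8 * i)) & 0xFF) - (int)((b >> (8 * i)) & 0xFF);
     *             r |= (uint32)(d < 0 ? -d : d) << (8 * i);
     *         }
     *         return r;
     *     }
     */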

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
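
/* The three inclusions of sad_mb_offset.h above instantiate sad_mb_offset3,
 * sad_mb_offset2 and sad_mb_offset1: NUMBER selects which byte offset of an
 * unaligned ref pointer the instantiation handles, and SHIFT (= 8*NUMBER) is
 * presumably the corresponding bit shift used inside that header to realign
 * the unaligned loads.  simd_sad_mb() below dispatches to one of them
 * whenever ref is not word-aligned. */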


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

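        /* Each pass of LOOP_SAD0 handles one 16-pixel row of the macroblock,
         * 8 pixels (two 4-pixel words) at a time.  The packed per-byte
         * absolute differences from sad_4pixel() are summed into x5, while x4
         * separately sums the high byte of each halfword; after the row the
         * two partial sums are combined and the cumulative SAD ends up in the
         * top halfword of x10.  That value is compared with dmin after every
         * row, so the loop exits early once this candidate block is already
         * worse than the best match found so far. */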
LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10; /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }

#elif defined(__CC_ARM)  /* only work with arm v5 */

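    /* ARM RVCT (armcc) versions of the same helpers, using embedded __asm
     * blocks; the algorithm matches the generic C path above. */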
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10,lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11,lsr #8; } /* accumulate high bytes */


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

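        /* MOVS x8, ref, lsl #31 shifts the two low address bits into the
         * flags (bit 0 -> N, bit 1 -> carry), so the conditional branches
         * below dispatch on ref & 3 just like the C version: BHI for
         * offset 3, BCS for offset 2, BMI for offset 1. */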
        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

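    /* GCC/ARM inline-assembly version.  Because the generic C branch above is
     * selected for every compiler that does not define __CC_ARM - including
     * GCC - this block is effectively never compiled; it is kept for
     * reference (see the comment at the top of the file). */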
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
__asm__ volatile("rsbs	%1, %1, %2\n\trsbmi %1, %1, #0\n\tadd	%0, %0, %1": "=r"(sad): "r"(tmp), "r"(tmp2));
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR	%1, %2, %0\n\tSUBS  %0, %2, %0\n\tEOR	%1, %1, %0\n\tAND  %1, %3, %1, lsr #1\n\tORRCC	%1, %1, #0x80000000\n\tRSB  %1, %1, %1, lsl #8\n\tADD  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR	%1, %2, %0\n\tADDS  %0, %2, %0\n\tEOR  %1, %1, %0\n\tANDS  %1, %3, %1, rrx\n\tRSB  %1, %1, %1, lsl #8\n\tSUB	%0, %0, %1, asr #7\n\tEOR   %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

#define sum_accumulate  __asm__ volatile("SBC  %0, %0, %1\n\tBIC   %1, %4, %1\n\tADD   %2, %2, %1, lsr #8\n\tSBC   %0, %0, %3\n\tBIC   %3, %4, %3\n\tADD   %2, %2, %3, lsr #8": "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11): "r" (x6));

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;
///
__asm__ volatile("MVN	%0, #0xFF00": "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
__asm__ volatile("LDR	%0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
__asm__ volatile("LDR	%0, [%1], #16": "=&r"(x12), "=r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_