sad_inline.h revision 4b43b41eaf8c4c80f66185e13620cf94b8b2ef5b
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;    /* accumulate |tmp - tmp2| */
        else sad -= tmp;

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^ (x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }
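
    /*
     * Note on the trick above: all four byte lanes are subtracted in one
     * 32-bit operation.  x7 picks up a marker bit (bit 7 of a lane) for
     * every byte whose difference borrowed from the lane above it;
     * ((x7 << 8) - x7) >> 7 turns each marker into a 0xFF byte mask, and
     * the add/xor pair negates exactly those bytes.  A byte-at-a-time
     * sketch of the same packed result, for reference only (this helper
     * is illustrative and not part of the original header):
     */
#if 0
    __inline uint32 sad_4pixel_ref(uint32 src1, uint32 src2)
    {
        uint32 res = 0;
        int i;

        for (i = 0; i < 4; i++)
        {
            uint32 a = (src1 >> (i << 3)) & 0xFF;
            uint32 b = (src2 >> (i << 3)) & 0xFF;
            uint32 d = (a > b) ? (a - b) : (b - a);
            res |= d << (i << 3);   /* byte i holds |a - b| */
        }
        return res;
    }
#endif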

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
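
    /*
     * sad_mb_offset.h acts as a template: each include above re-expands it
     * with a different (NUMBER, SHIFT) pair, generating sad_mb_offset3,
     * sad_mb_offset2 and sad_mb_offset1 for reference pointers that sit 3,
     * 2 or 1 bytes past a 32-bit word boundary.
     */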

    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */
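
        /*
         * x4 holds the odd-byte lane sums and x5 the full packed sums, so
         * x5 - (x4 << 8) leaves the even-byte sums as two halfword lanes;
         * adding x4 merges the odd-byte sums into the same lanes, and the
         * final shifted add folds the low halfword into the high one, so
         * the running 16-bit SAD sits in the top half of x10.
         */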

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
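
    /*
     * For reference, a straight scalar version of what simd_sad_mb computes
     * for an aligned ref pointer: the 16x16 sum of absolute differences,
     * with the same per-row early exit once the running SAD exceeds dmin.
     * This sketch is illustrative only and is not part of the original
     * header.
     */
#if 0
    __inline int32 scalar_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 sad = 0;
        int row, col;

        for (row = 0; row < 16; row++)
        {
            for (col = 0; col < 16; col++)
            {
                int32 d = ref[col] - blk[col];
                sad += (d >= 0) ? d : -d;
            }
            if (sad > dmin)     /* same early-out as the loop above */
                return sad;
            ref += lx;          /* ref rows are lx bytes apart */
            blk += 16;          /* blk is a packed 16x16 block */
        }
        return sad;
    }
#endif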

#elif defined(__CC_ARM)  /* only works with ARM v5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7;  /* take absolute value of negative byte */
        }

        return src1;
    }
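
    /*
     * The rrx operand in sad_4pixelN shifts x7 right by one while feeding
     * the carry produced by the ADDS into bit 31, so the carry out of the
     * top byte lane joins the marker mask in a single instruction;
     * sad_4pixel needs the separate ORRCC for the same purpose.
     */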

#define sum_accumulate  __asm{      SBC      x5, x5, x10;  /* accumulate low bytes */ \
        BIC      x10, x6, x10;   /* x10 & 0xFF00FF00 */ \
        ADD      x4, x4, x10, lsr #8;   /* accumulate high bytes */ \
        SBC      x5, x5, x11;    /* accumulate low bytes */ \
        BIC      x11, x6, x11;   /* x11 & 0xFF00FF00 */ \
        ADD      x4, x4, x11, lsr #8; } /* accumulate high bytes */
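
    /*
     * sum_accumulate mirrors the add/mask/shift accumulation written out in
     * C in simd_sad_mb, keeping packed sums in x5 and the high (odd) bytes
     * in x4.  The SBC form appears to pair with sad_4pixelN in the
     * sad_mb_offset.h variants, which carry state in the processor flags
     * between packed steps.
     */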

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"
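
    /*
     * INC_X8 packs the loop step into x8's upper bits: each flag-setting
     * add of the constant advances the iteration count, and the eventual
     * wraparound out of the top bits terminates the loop, letting the
     * offset variants step and test with one instruction (compare the
     * ADDLSS #0x10000001 in simd_sad_mb below).
     */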

    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER (inline assembly; unreachable as written, since the first branch above tests the same condition) */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm__ volatile(
            "rsbs   %1, %1, %2\n\t"
            "rsbmi  %1, %1, #0\n\t"
            "add    %0, %0, %1"
            : "+r"(sad), "+r"(tmp)
            : "r"(tmp2)
            : "cc");
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR    %1, %2, %0\n\t"
            "SUBS   %0, %2, %0\n\t"
            "EOR    %1, %1, %0\n\t"
            "AND    %1, %3, %1, lsr #1\n\t"
            "ORRCC  %1, %1, #0x80000000\n\t"
            "RSB    %1, %1, %1, lsl #8\n\t"
            "ADD    %0, %0, %1, asr #7\n\t"
            "EOR    %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
            : "cc");

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR    %1, %2, %0\n\t"
            "ADDS   %0, %2, %0\n\t"
            "EOR    %1, %1, %0\n\t"
            "ANDS   %1, %3, %1, rrx\n\t"
            "RSB    %1, %1, %1, lsl #8\n\t"
            "SUB    %0, %0, %1, asr #7\n\t"
            "EOR    %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
            : "cc");

        return src1;
    }

#define sum_accumulate  __asm__ volatile(                       \
        "SBC   %0, %0, %1\n\t"                                  \
        "BIC   %1, %4, %1\n\t"                                  \
        "ADD   %2, %2, %1, lsr #8\n\t"                          \
        "SBC   %0, %0, %3\n\t"                                  \
        "BIC   %3, %4, %3\n\t"                                  \
        "ADD   %2, %2, %3, lsr #8"                              \
        : "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11)          \
        : "r" (x6));

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;

        __asm__ volatile("MVN %0, #0xFF00" : "=r"(x6)); /* x6 = 0xFFFF00FF */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        /* post-indexed load: x10 = *(int32*)ref, then ref += lx */
        __asm__ volatile("LDR %0, [%1], %2" : "=&r"(x10), "+r"(ref) : "r"(lx));
        x14 = *((int32*)(blk + 4));
        /* post-indexed load: x12 = *(int32*)blk, then blk += 16 */
        __asm__ volatile("LDR %0, [%1], #16" : "=&r"(x12), "+r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /****************/
        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }
        }

        return ((uint32)x10 >> 16);

466
467SadMBOffset3:
468
469        return sad_mb_offset3(ref, blk, lx, dmin);
470
471SadMBOffset2:
472
473        return sad_mb_offset2(ref, blk, lx, dmin);
474
475SadMBOffset1:
476
477        return sad_mb_offset1(ref, blk, lx, dmin);
478    }
479
480
481#endif
482
483#ifdef __cplusplus
484}
485#endif
486
487#endif // _SAD_INLINE_H_
488
489