/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
/*********************************************************************************/
/*  Filename: sad_inline.h                                                       */
/*  Description: Implementation for in-line functions used in dct.cpp            */
/*  Modified:                                                                    */
/*********************************************************************************/
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if !defined(PV_ARM_GCC_V5) && !defined(PV_ARM_GCC_V4) /* generic C version */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7);  /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^ (x7 >> 7);  /* take absolute value of negative byte */

        return src1;
    }
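
    /* For reference, a plain scalar equivalent of sad_4pixel() is sketched
       below. It is illustrative only (sad_4pixel_ref is not part of the
       original file): each byte of the packed result is the absolute
       difference of the corresponding bytes of src1 and src2, which the
       branch-free SWAR code above computes for all four bytes at once. */
    __inline int32 sad_4pixel_ref(int32 src1, int32 src2)
    {
        int32 i, a, b, out = 0;

        for (i = 0; i < 32; i += 8)
        {
            a = (src1 >> i) & 0xFF;
            b = (src2 >> i) & 0xFF;
            out |= (a > b ? a - b : b - a) << i;   /* pack |a - b| per byte */
        }
        return out;
    }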

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"
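
/* sad_mb_offset.h acts as an include-template: each inclusion above first
   defines NUMBER and SHIFT (SHIFT = 8 * NUMBER), generating the functions
   sad_mb_offset3, sad_mb_offset2 and sad_mb_offset1 used below for reference
   pointers that are 3, 2 or 1 bytes away from word alignment. */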

    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;
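
        /* The word loads in the main loop below assume a 4-byte aligned ref
           pointer; the misaligned cases branch to the sad_mb_offset variants,
           which read aligned words and realign the bytes instead. */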

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

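        /* Loop epilogue: x5 has accumulated whole packed words while x4 has
           accumulated only the high bytes, so x5 - (x4 << 8) isolates the
           low-byte sums, adding x4 folds the high-byte sums back in, and the
           final shift-add collapses the two halfword totals so the 16-bit
           SAD lands in the top half of x10. */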
        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }
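
    /* Hypothetical usage sketch, not part of the original API: a candidate
       search would pass its best SAD so far as dmin, letting simd_sad_mb()
       return early once a partial sum already exceeds it. The names
       sad_mb_best, offsets and num are illustrative assumptions. */
    __inline Int sad_mb_best(UChar *ref, UChar *blk, Int lx, Int *offsets, Int num)
    {
        Int i, sad;
        Int best = 65536;   /* above the 16x16 maximum SAD of 256 * 255 */

        for (i = 0; i < num; i++)
        {
            sad = simd_sad_mb(ref + offsets[i], blk, best, lx);
            if (sad < best) best = sad;   /* early-exit values exceed best */
        }
        return best;
    }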

#elif defined(__CC_ARM)  /* only works with ARM v5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;    /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7;  /* take absolute value of negative byte */
        }

        return src1;
    }

#define sum_accumulate  __asm{ SBC      x5, x5, x10;          /* accumulate low bytes */  \
                               BIC      x10, x6, x10;         /* x10 & 0xFF00FF00 */      \
                               ADD      x4, x4, x10, lsr #8;  /* accumulate high bytes */ \
                               SBC      x5, x5, x11;          /* accumulate low bytes */  \
                               BIC      x11, x6, x11;         /* x11 & 0xFF00FF00 */      \
                               ADD      x4, x4, x11, lsr #8; } /* accumulate high bytes */


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

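        /* Alignment dispatch: shifting ref left by 31 drops address bit 0
           into the N flag and address bit 1 into the carry flag, so a single
           MOVS classifies all four byte offsets: HI (C set, nonzero result)
           means both bits set (offset 3), CS means bit 1 only (offset 2),
           MI means bit 0 only (offset 1), and fall-through means aligned. */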
        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }
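
        /* RSBS computes (x10 >> 16) - dmin: while the partial SAD is still
           <= dmin (LS), ADDLSS advances the row counter packed into the
           upper bits of x8; on the 16th add the counter carries out, LS
           fails and the loop exits, while a partial SAD above dmin exits
           immediately. */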

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        register int32 out;
        register int32 temp1;
        register int32 ss = sad;
        register int32 tt = tmp;
        register int32 uu = tmp2;

        asm volatile("rsbs  %1, %4, %3\n\t"
                     "rsbmi %1, %1, #0\n\t"
                     "add   %0, %2, %1"
                     : "=&r"(out), "=&r"(temp1)
                     : "r"(ss), "r"(tt), "r"(uu));
        return out;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor   %0, %3, %2\n\t"
                     "subs  %1, %3, %2\n\t"
                     "eor   %0, %0, %1\n\t"
                     "and   %0, %4, %0, lsr #1\n\t"
                     "orrcc %0, %0, #0x80000000\n\t"
                     "rsb   %0, %0, %0, lsl #8\n\t"
                     "add   %1, %1, %0, asr #7\n\t"
                     "eor   %1, %1, %0, asr #7"
                     : "=&r"(out), "=&r"(temp1)
                     : "r"(s1), "r"(s2), "r"(mm));

        return temp1;
    }
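
    /* Note on the constraints: "=&r" marks each output as early-clobber, so
       GCC will not allocate it to the same register as any input; the asm
       writes %0 and %1 before it has finished reading %2-%4. */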

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        register int32 out;
        register int32 temp1;
        register int32 s1 = src1;
        register int32 s2 = src2;
        register int32 mm = mask;

        asm volatile("eor    %1, %3, %2\n\t"
                     "adds   %0, %3, %2\n\t"
                     "eor    %1, %1, %0\n\t"
                     "ands   %1, %4, %1, rrx\n\t"
                     "rsb    %1, %1, %1, lsl #8\n\t"
                     "sub    %0, %0, %1, asr #7\n\t"
                     "eor    %0, %0, %1, asr #7"
                     : "=&r"(out), "=&r"(temp1)
                     : "r"(s1), "r"(s2), "r"(mm));

        return (out);
    }

#define sum_accumulate asm volatile("sbc  %0, %0, %1\n\t" \
                                    "bic  %1, %4, %1\n\t" \
                                    "add  %2, %2, %1, lsr #8\n\t" \
                                    "sbc  %0, %0, %3\n\t" \
                                    "bic  %3, %4, %3\n\t" \
                                    "add  %2, %2, %3, lsr #8" \
                                    : "+r"(x5), "+r"(x10), "+r"(x4), "+r"(x11) \
                                    : "r"(x6));
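
/* Unlike the __CC_ARM version above, this macro names the enclosing scope's
   variables directly: x5, x10, x4 and x11 are read-write ("+r") operands, so
   the compiler tracks both their inputs and their updated values. */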

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(UChar *ref, UChar *blk, Int dmin, Int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16; /* 16 rows per macroblock */

        asm volatile("mvn %0, #0xFF00" : "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        asm volatile("ldr  %0, [%4, #4]\n\t"
                     "ldr  %1, [%4], %6\n\t"
                     "ldr  %2, [%5, #4]\n\t"
                     "ldr  %3, [%5], #16"
                     : "=r"(x11), "=r"(x10), "=r"(x14), "=r"(x12), "+r"(ref), "+r"(blk)
                     : "r"(lx));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }
#endif // compiler-specific versions

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_