/* sad_mb_offset.h -- revision 3fdb405597f0e062a9bb8af20199c5e67f0f764c */
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
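
/* Computes the 16x16 SAD between a reference block that sits NUMBER
 * (1, 2, or 3) bytes away from word alignment and a 16-byte-per-row
 * current macroblock, with early exit once the running SAD exceeds dmin.
 * The file is meant to be #included once per value of NUMBER; SHIFT is
 * presumably defined by the including file as 8*NUMBER, so that two
 * aligned word loads plus shifts can reassemble each unaligned 4-byte
 * group of reference pixels. */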

/* Intentionally not using the gcc asm version: even with its faulty
 * register constraints fixed so that it no longer crashes, it is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x4 = x5 = 0;     /* split SAD accumulators: see the fold below */
    x6 = 0xFFFF00FF; /* byte-lane mask */
    x9 = 0x80808080; /* per-byte sign-bit constant used by sad_4pixel */
    ref -= NUMBER;   /* align ref down to a word boundary (bic ref, ref, #3) */
    ref -= lx;
    blk -= 16;
    x8 = 16;         /* row counter: 16 macroblock rows */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
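    /* Each unaligned 4-byte group of the reference row is synthesized from
     * two aligned word loads funneled together (lsr #SHIFT with
     * lsl #(32-SHIFT)), so no unaligned word access is ever issued. */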
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx)); /* D C B A */
    x11 = *((uint32*)(ref + 4));   /* H G F E */
    x12 = *((uint32*)(ref + 8));   /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);      /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT)); /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);      /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT)); /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                 /* accumulate whole packed result */
    x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00: keep high bytes */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes separately */
    x5 = x5 + x11;                 /* accumulate whole packed result */
    x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00: keep high bytes */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes separately */
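    /* x5 accumulates whole packed results while x4 separately accumulates
     * the two high-byte lanes; subtracting (x4 << 8) back out at the end
     * leaves each remaining lane a full 16 bits of headroom, so 16 rows of
     * byte differences can be summed without lanes overflowing into each
     * other. */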

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));  /* D C B A */
    x11 = *((uint32*)(ref + 12)); /* H G F E */
    x12 = *((uint32*)(ref + 16)); /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);      /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT)); /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);      /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT)); /* K J I H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                 /* accumulate whole packed result */
    x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00: keep high bytes */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes separately */
    x5 = x5 + x11;                 /* accumulate whole packed result */
    x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00: keep high bytes */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes separately */

    /****************/
    /* Fold the packed lane sums into one total: remove the high-byte
     * partial sums from x5, add them back in as plain integers, then add
     * the low halfword into the high halfword so that the finished SAD
     * sits in bits 31:16 of x10. */
    x10 = x5 - (x4 << 8);    /* extract low-byte lane sums */
    x10 = x10 + x4;          /* add the high-byte lane sums */
    x10 = x10 + (x10 << 16); /* fold lower halfword into upper */

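    /* Early exit: if the partial SAD already exceeds dmin, this candidate
     * cannot beat the current best match, so stop and return immediately. */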
    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__CC_ARM)  /* only works with ARM v5 */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm{
        MVN      x6, #0xff0000;
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC      ref, ref, #3;
    }
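    /* BIC re-aligns ref down to a word boundary inside the loop, playing
     * the same role as the ref -= NUMBER adjustment in the generic C
     * version above. */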
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;
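    /* sad_4pixelN and sum_accumulate are presumably macros supplied by the
     * including file (sad_inline.h in this codebase); the N variant expects
     * the bitwise-complemented pixel groups produced by MVN/BIC above. */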

    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR      x10, [ref], lx;
        LDR      x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    __asm{
        RSBS     x11, dmin, x10, lsr #16
        ADDLSS   x8, x8, #INC_X8
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
        BLS      LOOP_SAD2;
#elif (NUMBER==1)
        BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU compiler; not reached,
                                               since the generic C branch
                                               above takes precedence */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;
    x8 = 16; /* row counter: 16 macroblock rows */

    __asm__ volatile("MVN       %0, #0xFF0000": "=r"(x6));

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    __asm__ volatile("BIC  %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
    __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}

#endif
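
/* For reference, a minimal scalar sketch of what the routines above compute,
 * kept out of the build with #if 0.  The name sad_mb_scalar is hypothetical;
 * it assumes the same contract as above: ref points at the (possibly
 * unaligned) reference block, blk at the current macroblock stored 16 bytes
 * per row, lx is the reference line pitch, and the caller passes the best
 * SAD so far in dmin for early termination.  The NUMBER/SHIFT alignment
 * games disappear here because plain byte accesses need no alignment. */
#if 0
__inline int32 sad_mb_scalar(uint8 *ref, uint8 *blk, int lx, int dmin)
{
    int32 sad = 0;
    int row, col;

    for (row = 0; row < 16; row++)
    {
        for (col = 0; col < 16; col++)
        {
            int diff = (int)ref[col] - (int)blk[col];
            sad += (diff >= 0) ? diff : -diff;
        }
        if (sad > dmin) /* candidate already worse than best match: stop */
        {
            break;
        }
        ref += lx;
        blk += 16;
    }
    return sad;
}
#endif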