1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18
19/* Intentionally not using the gcc asm version, since it is
20 * slightly slower than the plain C version on modern GCC versions. */
21#if !defined(__CC_ARM) /* Generic C version */
22
/*
 * Template for sad_mb_offset1/2/3 (generic C version).
 *
 * This file is #include'd several times by the SAD module with the
 * macros NUMBER (1, 2 or 3) and SHIFT pre-defined, selecting which of
 * the three functions is generated.  SHIFT is assumed to equal
 * 8*NUMBER -- TODO confirm at the include site.
 *
 * Computes the sum of absolute differences (SAD) over a 16x16
 * macroblock between a reference area whose pointer is NUMBER bytes
 * past a 32-bit word boundary and the current block:
 *
 *   ref  - reference pointer; assumed (ref & 3) == NUMBER (see the
 *          alignment step below)
 *   blk  - current macroblock: 16 bytes read per row, rows 16 bytes apart
 *   lx   - reference-frame line stride in bytes
 *   dmin - best SAD so far; the row loop exits early once the running
 *          SAD exceeds it, and that partial SAD is returned so the
 *          caller rejects this candidate
 *
 * Relies on sad_4pixel() (defined elsewhere), which from its use here
 * appears to return the four packed per-byte absolute differences of
 * its first two arguments -- confirm against its definition.
 *
 * NOTE(review): reads ref/blk through uint32* casts; alignment is
 * established by the BIC-style adjustment below, but this is formally
 * a strict-aliasing violation tolerated throughout this codebase.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4<<8) - x4;
    /* x5 accumulates every byte lane of the packed differences; x4
     * accumulates only the high (odd) byte lanes, so the low lanes can
     * be recovered at the end as x5 - (x4 << 8). */
    x4 = x5 = 0;
    x6 = 0xFFFF00FF;    /* (x6 << 8) == 0xFF00FF00, the high-lane mask */
    x9 = 0x80808080; /* const. */
    ref -= NUMBER; /* bic ref, ref, #3 : word-align ref, assuming (ref & 3) == NUMBER */
    ref -= lx;     /* pre-decrement; the loop pre-increments by lx */
    blk -= 16;     /* likewise, the loop pre-increments by 16 */
    x8 = 16;       /* row counter: 16 rows per macroblock */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx)); /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    /* Funnel-shift the three aligned words to recover the 8 unaligned
     * reference bytes starting SHIFT/8 bytes into the first word. */
    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8)); /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    /* Same funnel shift for the second half of the row.  (The previous
     * MVN/BIC comments here described the complemented ARM-assembly
     * variant below, not this plain-shift C code.) */
    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10; /* accumulate low bytes */
    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;  /* accumulate low bytes */
    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
    /* Fold both lane accumulators into one sum held in the upper
     * halfword of x10; the SAD so far is x10 >> 16. */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /* Early termination: keep looping only while the partial SAD is
     * still no worse than the best candidate so far. */
    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto         LOOP_SAD3;
#elif (NUMBER==2)
            goto         LOOP_SAD2;
#elif (NUMBER==1)
            goto         LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}
123
124#elif defined(__CC_ARM)  /* only work with arm v5 */
125
/*
 * ARM RVCT (armcc) embedded-assembler version of sad_mb_offset1/2/3 --
 * same SAD computation as the generic C version, with two differences
 * visible here:
 *  - the loop counter x8 is passed in by the caller and is stepped
 *    with the INC_X8 macro (defined by the including file -- TODO
 *    confirm its value and direction at the include site);
 *  - the realigned reference words are built in complemented form
 *    (MVN/BIC funnel shift), so the complemented-input helper
 *    sad_4pixelN() and the sum_accumulate macro (both defined
 *    elsewhere) are used.
 * Note this version processes the second 8 pixels of each row first
 * (ref+8 .. ref+16), then the first 8 while stepping ref by lx.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;     /* byte-lane accumulators (see generic C version) */

    /* MVN gives x6 = ~0x00FF0000 = 0xFF00FFFF; BIC word-aligns ref. */
    __asm{
        MVN      x6, #0xff0000;
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC      ref, ref, #3;
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* Complemented funnel shift: x10/x11 end up holding the bitwise
     * NOT of the 8 realigned reference bytes. */
    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR  x10, [ref], lx ;
        LDR  x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    /* Fold the lane accumulators; the running SAD is x10 >> 16. */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /* RSBS computes (x10 >> 16) - dmin: LS holds while the partial SAD
     * is still <= dmin.  ADDLSS then steps x8 by INC_X8 (updating the
     * flags) and BLS re-enters the loop until the counter expires --
     * TODO confirm INC_X8 semantics at the include site. */
    __asm{
        RSBS     x11, dmin, x10, lsr #16
        ADDLSS   x8, x8, #INC_X8
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
BLS      LOOP_SAD2;
#elif (NUMBER==1)
BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}
215
216#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */
217
/*
 * GNU/GCC ARM inline-assembly version of sad_mb_offset1/2/3 -- same
 * computation and the same interface as the generic C version (the row
 * counter x8 is local again, initialized to 16).  The realigned
 * reference words are built in complemented form (MVN/BIC), so the
 * complemented-input helper sad_4pixelN() and the sum_accumulate macro
 * (both defined elsewhere) are used.  The funnel shift is written out
 * once per SHIFT value (8/16/24) with literal immediates instead of
 * the SHIFT macro.
 */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;     /* byte-lane accumulators (see generic C version) */
    x8 = 16;         /* row counter: 16 rows per macroblock */

    /* x6 = ~0x00FF0000 = 0xFF00FFFF */
    __asm__ volatile("MVN       %0, #0xFF0000": "=r"(x6));

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /* Word-align ref; assumes (ref & 3) == NUMBER on entry each row. */
    __asm__ volatile("BIC  %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* Complemented funnel shift, specialized on SHIFT: x10/x11 end up
     * holding the bitwise NOT of the 8 realigned reference bytes. */
#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx; /* advance to the next row */
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
    __asm__ volatile(
        "MVN   %0, %0, lsr #8\n\t"
        "BIC   %0, %0, %1, lsl #24\n\t"
        "MVN   %1, %1, lsr #8\n\t"
        "BIC   %1, %1, %2, lsl #24"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==16)
    __asm__ volatile(
        "MVN   %0, %0, lsr #16\n\t"
        "BIC   %0, %0, %1, lsl #16\n\t"
        "MVN   %1, %1, lsr #16\n\t"
        "BIC   %1, %1, %2, lsl #16"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#elif (SHIFT==24)
    __asm__ volatile(
        "MVN   %0, %0, lsr #24\n\t"
        "BIC   %0, %0, %1, lsl #8\n\t"
        "MVN   %1, %1, lsr #24\n\t"
        "BIC   %1, %1, %2, lsl #8"
        : "+r"(x10), "+r"(x11)
        : "r"(x12)
    );
#endif
    /* Post-incrementing load: x12 = *blk, then blk += 16 (next row). */
    __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    /* Fold both lane accumulators into one sum held in the upper
     * halfword of x10; the SAD so far is x10 >> 16. */
    x10 = x5 - (x4 << 8); /* extract low bytes */
    x10 = x10 + x4;     /* add with high bytes */
    x10 = x10 + (x10 << 16); /* add with lower half word */

    /* Early termination: keep looping only while the partial SAD is
     * still no worse than the best candidate so far. */
    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto         LOOP_SAD3;
#elif (NUMBER==2)
            goto         LOOP_SAD2;
#elif (NUMBER==1)
            goto         LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}
353
354#endif
355
356