/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
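/*
 * Template for the SAD (sum of absolute differences) of a 16x16 macroblock
 * whose reference pointer is misaligned by NUMBER bytes (1, 2, or 3).  The
 * including file is expected to define NUMBER and SHIFT (8*NUMBER, i.e. 8,
 * 16, or 24, matching the specializations below) before each inclusion,
 * yielding sad_mb_offset1/2/3; sad_4pixel, sad_4pixelN, sum_accumulate and
 * INC_X8 are assumed to come from a companion SAD inline header.
 */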

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x4 = x5 = 0;
    x6 = 0xFFFF00FF;              /* byte-lane mask; (x6 << 8) == 0xFF00FF00 below */
    x9 = 0x80808080;              /* constant sign-bit mask for sad_4pixel */
    ref -= NUMBER;                /* word-align ref, i.e. "bic ref, ref, #3" when (ref & 3) == NUMBER */
    ref -= lx;
    blk -= 16;
    x8 = 16;                      /* row counter: 16 rows per macroblock */

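    /*
     * Setup: ref is aligned down to a word boundary (the caller presumably
     * guarantees ref & 3 == NUMBER), then biased by -lx, and blk by -16, so
     * the pre-incremented loads at the top of the loop land on row 0.  x8
     * counts the 16 macroblock rows; x4/x5 are the split SAD accumulators
     * (see the accumulation step below).
     */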
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref += lx));  /* D C B A */
    x11 = *((uint32*)(ref + 4));    /* H G F E */
    x12 = *((uint32*)(ref + 8));    /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);             /* 0 0 0 D (byte layouts shown for SHIFT==24) */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);             /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

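    /*
     * Unaligned-load emulation: the three aligned word loads above are
     * spliced with SHIFT / (32-SHIFT) shifts to synthesize two 32-bit reads
     * at byte offset NUMBER (little-endian layout, SHIFT = 8*NUMBER), since
     * the targeted ARM cores have no unaligned LDR.
     */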
    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

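    /*
     * sad_4pixel (defined elsewhere, presumably in the companion SAD inline
     * header) appears to compute byte-wise |ref - blk| for four packed bytes
     * in SWAR fashion, with x9 = 0x80808080 as the sign-bit mask.
     */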
    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                 /* accumulate low bytes */
    x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;                 /* accumulate low bytes */
    x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

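    /*
     * Split accumulation: x5 adds the whole packed word (all four byte
     * lanes), while x4 separately accumulates the two high-order lanes
     * (mask 0xFF00FF00, shifted down 8).  Individual byte lanes would
     * overflow over 16 rows, so the final reduction uses x4 to cancel the
     * high lanes back out of x5.
     */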
    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));    /* D C B A */
    x11 = *((uint32*)(ref + 12));   /* H G F E */
    x12 = *((uint32*)(ref + 16));   /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);             /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);             /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                 /* accumulate low bytes */
    x10 = x10 & (x6 << 8);         /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
    x5 = x5 + x11;                 /* accumulate low bytes */
    x11 = x11 & (x6 << 8);         /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

    /****************/
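    /*
     * Final reduction: subtracting (x4 << 8) cancels the high-lane bytes
     * that x5 double-counted, leaving the two low-lane sums as 16-bit
     * fields; adding x4 merges the high-lane sums in, and the shifted
     * self-add folds both halfwords so the complete SAD lands in the top
     * 16 bits of x10.
     */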
    x10 = x5 - (x4 << 8);    /* cancel the high-lane bytes: packed low-lane sums */
    x10 = x10 + x4;          /* merge in the high-lane sums */
    x10 = x10 + (x10 << 16); /* fold the two halfwords; total SAD is in the top 16 bits */

    if ((int)((uint32)x10 >> 16) <= dmin) /* keep looping only while the running SAD is within dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}
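/*
 * Usage sketch (hypothetical file/macro names, following the multi-include
 * pattern this template implies): the including translation unit would
 * instantiate all three variants along the lines of
 *
 *   #define NUMBER 3
 *   #define SHIFT  24
 *   #include "sad_mb_offset.h"
 *   #undef  NUMBER
 *   #undef  SHIFT
 *   #define NUMBER 2
 *   #define SHIFT  16
 *   #include "sad_mb_offset.h"
 *   ...and likewise for NUMBER 1 / SHIFT 8.
 */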

#elif defined(__CC_ARM)  /* ARM compiler (armcc); only works with ARMv5 */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* constant sign-bit mask for sad_4pixelN */
    x4 = x5 = 0;

    __asm{
        MVN      x6, #0xff0000;
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC      ref, ref, #3;
    }
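    /*
     * Each pass re-aligns ref with BIC (the loop label sits inside the asm
     * block above, so BIC executes every iteration); the NUMBER misaligned
     * bytes are then recovered by the MVN/BIC splicing sequence below.
     */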
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;
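    /*
     * sum_accumulate is presumably a macro supplied by the companion SAD
     * inline header; it should fold x10/x11 into the split accumulators
     * x4/x5, mirroring the explicit "accumulate low/high bytes" statements
     * of the C-only branch above.
     */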

    __asm{
        /****** process 8 pixels ******/
        LDR      x11, [ref, #4];
        LDR      x12, [ref, #8];
        LDR      x10, [ref], lx;
        LDR      x14, [blk, #4];

        MVN      x10, x10, lsr #SHIFT;
        BIC      x10, x10, x11, lsl #(32-SHIFT);
        MVN      x11, x11, lsr #SHIFT;
        BIC      x11, x11, x12, lsl #(32-SHIFT);

        LDR      x12, [blk], #16;
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);    /* cancel the high-lane bytes: packed low-lane sums */
    x10 = x10 + x4;          /* merge in the high-lane sums */
    x10 = x10 + (x10 << 16); /* fold the two halfwords; total SAD is in the top 16 bits */

    __asm{
        RSBS     x11, dmin, x10, lsr #16;
        ADDLSS   x8, x8, #INC_X8;
#if (NUMBER==3)
        BLS      LOOP_SAD3;
#elif (NUMBER==2)
        BLS      LOOP_SAD2;
#elif (NUMBER==1)
        BLS      LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER, inline-asm variant (unreachable: the first branch tests the same condition) */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* constant sign-bit mask for sad_4pixelN */
    x4 = x5 = 0;
    x8 = 16;         /* row counter: 16 rows per macroblock */

    __asm__ volatile("MVN   %0, #0xFF0000": "=r"(x6));
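    /*
     * x6 = ~0x00FF0000 = 0xFF00FFFF, a byte-lane mask that is never read
     * directly in this branch; it is presumably consumed inside the
     * sum_accumulate macro.
     */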

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /* "+r": ref is both read and written here (the original "=r" under-constrained it) */
    __asm__ volatile("BIC   %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    /* "+&r": x10/x11 are read-modify-write; the original "=&r" dropped their input values */
#if (SHIFT==8)
    __asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1, lsl #24\n\tMVN   %1, %1, lsr #8\n\tBIC   %1, %1, %2, lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
    __asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1, lsl #16\n\tMVN   %1, %1, lsr #16\n\tBIC   %1, %1, %2, lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
    __asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1, lsl #8\n\tMVN   %1, %1, lsr #24\n\tBIC   %1, %1, %2, lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif
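    /*
     * Unlike the plain shifts of the first branch, MVN/BIC build the bitwise
     * complement of the spliced reference word ("~G ~F ~E ~D"); the
     * N-suffixed sad_4pixelN is presumably the variant that expects its
     * first operand pre-complemented.
     */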

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
    __asm__ volatile("MVN   %0, %0, lsr #8\n\tBIC   %0, %0, %1, lsl #24\n\tMVN   %1, %1, lsr #8\n\tBIC   %1, %1, %2, lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
    __asm__ volatile("MVN   %0, %0, lsr #16\n\tBIC   %0, %0, %1, lsl #16\n\tMVN   %1, %1, lsr #16\n\tBIC   %1, %1, %2, lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
    __asm__ volatile("MVN   %0, %0, lsr #24\n\tBIC   %0, %0, %1, lsl #8\n\tMVN   %1, %1, lsr #24\n\tBIC   %1, %1, %2, lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif
    /* post-increment load: blk is read and written, so "+r" rather than "=r" */
    __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);    /* cancel the high-lane bytes: packed low-lane sums */
    x10 = x10 + x4;          /* merge in the high-lane sums */
    x10 = x10 + (x10 << 16); /* fold the two halfwords; total SAD is in the top 16 bits */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* keep looping only while the running SAD is within dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }
    }

    return ((uint32)x10 >> 16);
}

#endif