1/* ------------------------------------------------------------------
2 * Copyright (C) 1998-2009 PacketVideo
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
13 * express or implied.
14 * See the License for the specific language governing permissions
15 * and limitations under the License.
16 * -------------------------------------------------------------------
17 */
18/*********************************************************************************/
19/*  Filename: sad_mb_offset.h                                                       */
20/*  Description: Implementation for in-line functions used in dct.cpp           */
21/*  Modified:                                                                   */
22/*********************************************************************************/
23
24#if !defined(PV_ARM_GCC_V4) && !defined(PV_ARM_GCC_V5) /* ARM GNU COMPILER  */
25
26#if (NUMBER==3)
27__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
28#elif (NUMBER==2)
29__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
30#elif (NUMBER==1)
31__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
32#endif
33{
34    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
35
36    //  x5 = (x4<<8) - x4;
37    x4 = x5 = 0;
38    x6 = 0xFFFF00FF;
39    x9 = 0x80808080; /* const. */
40    ref -= NUMBER; /* bic ref, ref, #3 */
41    ref -= lx;
42    blk -= 16;
43    x8 = 16;
44
45#if (NUMBER==3)
46LOOP_SAD3:
47#elif (NUMBER==2)
48LOOP_SAD2:
49#elif (NUMBER==1)
50LOOP_SAD1:
51#endif
52    /****** process 8 pixels ******/
53    x10 = *((uint32*)(ref += lx)); /* D C B A */
54    x11 = *((uint32*)(ref + 4));    /* H G F E */
55    x12 = *((uint32*)(ref + 8));    /* L K J I */
56
57    x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */
58    x10 = x10 | (x11 << (32 - SHIFT));        /* G F E D */
59    x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */
60    x11 = x11 | (x12 << (32 - SHIFT));        /* K J I H */
61
62    x12 = *((uint32*)(blk += 16));
63    x14 = *((uint32*)(blk + 4));
64
65    /* process x11 & x14 */
66    x11 = sad_4pixel(x11, x14, x9);
67
68    /* process x12 & x10 */
69    x10 = sad_4pixel(x10, x12, x9);
70
71    x5 = x5 + x10; /* accumulate low bytes */
72    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
73    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
74    x5 = x5 + x11;  /* accumulate low bytes */
75    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
76    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
77
78    /****** process 8 pixels ******/
79    x10 = *((uint32*)(ref + 8)); /* D C B A */
80    x11 = *((uint32*)(ref + 12));   /* H G F E */
81    x12 = *((uint32*)(ref + 16));   /* L K J I */
82
83    x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
84    x10 = x10 | (x11 << (32 - SHIFT));        /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
85    x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */
86    x11 = x11 | (x12 << (32 - SHIFT));        /* ~K ~J ~I ~H */
87
88    x12 = *((uint32*)(blk + 8));
89    x14 = *((uint32*)(blk + 12));
90
91    /* process x11 & x14 */
92    x11 = sad_4pixel(x11, x14, x9);
93
94    /* process x12 & x10 */
95    x10 = sad_4pixel(x10, x12, x9);
96
97    x5 = x5 + x10; /* accumulate low bytes */
98    x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
99    x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
100    x5 = x5 + x11;  /* accumulate low bytes */
101    x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
102    x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */
103
104    /****************/
105    x10 = x5 - (x4 << 8); /* extract low bytes */
106    x10 = x10 + x4;     /* add with high bytes */
107    x10 = x10 + (x10 << 16); /* add with lower half word */
108
109    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
110    {
111        if (--x8)
112        {
113#if (NUMBER==3)
114            goto         LOOP_SAD3;
115#elif (NUMBER==2)
116            goto         LOOP_SAD2;
117#elif (NUMBER==1)
118            goto         LOOP_SAD1;
119#endif
120        }
121
122    }
123
124    return ((uint32)x10 >> 16);
125}
126
127#elif defined(__CC_ARM)  /* only work with arm v5 */
128
129#if (NUMBER==3)
130__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
131#elif (NUMBER==2)
132__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
133#elif (NUMBER==1)
134__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin, int32 x8)
135#endif
136{
137    int32 x4, x5, x6, x9, x10, x11, x12, x14;
138
139    x9 = 0x80808080; /* const. */
140    x4 = x5 = 0;
141
142    __asm{
143        MVN      x6, #0xff0000;
144        BIC      ref, ref, #3;
145
146#if (NUMBER==3)
147LOOP_SAD3:
148#elif (NUMBER==2)
149LOOP_SAD2:
150#elif (NUMBER==1)
151LOOP_SAD1:
152#endif
153    }
154    /****** process 8 pixels ******/
155    x11 = *((int32*)(ref + 12));
156    x12 = *((int32*)(ref + 16));
157    x10 = *((int32*)(ref + 8));
158    x14 = *((int32*)(blk + 12));
159
160    __asm{
161        MVN      x10, x10, lsr #SHIFT;
162        BIC      x10, x10, x11, lsl #(32-SHIFT);
163        MVN      x11, x11, lsr #SHIFT;
164        BIC      x11, x11, x12, lsl #(32-SHIFT);
165
166        LDR      x12, [blk, #8];
167    }
168
169    /* process x11 & x14 */
170    x11 = sad_4pixelN(x11, x14, x9);
171
172    /* process x12 & x10 */
173    x10 = sad_4pixelN(x10, x12, x9);
174
175    sum_accumulate;
176
177    __asm{
178        /****** process 8 pixels ******/
179        LDR      x11, [ref, #4];
180        LDR      x12, [ref, #8];
181        LDR  x10, [ref], lx ;
182        LDR  x14, [blk, #4];
183
184        MVN      x10, x10, lsr #SHIFT;
185        BIC      x10, x10, x11, lsl #(32-SHIFT);
186        MVN      x11, x11, lsr #SHIFT;
187        BIC      x11, x11, x12, lsl #(32-SHIFT);
188
189        LDR      x12, [blk], #16;
190    }
191
192    /* process x11 & x14 */
193    x11 = sad_4pixelN(x11, x14, x9);
194
195    /* process x12 & x10 */
196    x10 = sad_4pixelN(x10, x12, x9);
197
198    sum_accumulate;
199
200    /****************/
201    x10 = x5 - (x4 << 8); /* extract low bytes */
202    x10 = x10 + x4;     /* add with high bytes */
203    x10 = x10 + (x10 << 16); /* add with lower half word */
204
205    __asm{
206        RSBS     x11, dmin, x10, lsr #16
207        ADDLSS   x8, x8, #INC_X8
208#if (NUMBER==3)
209        BLS      LOOP_SAD3;
210#elif (NUMBER==2)
211BLS      LOOP_SAD2;
212#elif (NUMBER==1)
213BLS      LOOP_SAD1;
214#endif
215    }
216
217    return ((uint32)x10 >> 16);
218}
219
220#elif ( defined(PV_ARM_GCC_V5) || defined(PV_ARM_GCC_V4) ) /* ARM GNU COMPILER  */
221
222#if (NUMBER==3)
223__inline int32 sad_mb_offset3(UChar *ref, UChar *blk, Int lx, Int dmin)
224#elif (NUMBER==2)
225__inline int32 sad_mb_offset2(UChar *ref, UChar *blk, Int lx, Int dmin)
226#elif (NUMBER==1)
227__inline int32 sad_mb_offset1(UChar *ref, UChar *blk, Int lx, Int dmin)
228#endif
229{
230    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;
231
232    //  x5 = (x4<<8) - x4;
233    x4 = x5 = 0;
234    x6 = 0xFFFF00FF;
235    x9 = 0x80808080; /* const. */
236    ref -= NUMBER; /* bic ref, ref, #3 */
237    ref -= lx;
238    x8 = 16;
239
240#if (NUMBER==3)
241LOOP_SAD3:
242#elif (NUMBER==2)
243LOOP_SAD2:
244#elif (NUMBER==1)
245LOOP_SAD1:
246#endif
247    /****** process 8 pixels ******/
248    x10 = *((uint32*)(ref += lx)); /* D C B A */
249    x11 = *((uint32*)(ref + 4));    /* H G F E */
250    x12 = *((uint32*)(ref + 8));    /* L K J I */
251
252    int32 shift = SHIFT;
253    int32 shift2 = 32 - SHIFT;
254    asm volatile("ldr  %3, [%4, #4]\n\t"
255                 "mvn  %0, %0, lsr %5\n\t"
256                 "bic  %0, %0, %1, lsl %6\n\t"
257                 "mvn  %1, %1, lsr %5\n\t"
258                 "bic  %1, %1, %2, lsl %6\n\t"
259                 "ldr  %2, [%4, #8]"
260             : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
261                         : "r"(blk), "r"(shift), "r"(shift2));
262
263    /* process x11 & x14 */
264    x11 = sad_4pixel(x11, x14, x9);
265
266    /* process x12 & x10 */
267    x10 = sad_4pixel(x10, x12, x9);
268
269    sum_accumulate;
270
271    /****** process 8 pixels ******/
272    x10 = *((uint32*)(ref + 8)); /* D C B A */
273    x11 = *((uint32*)(ref + 12));   /* H G F E */
274    x12 = *((uint32*)(ref + 16));   /* L K J I */
275
276    asm volatile("ldr  %3, [%4, #4]\n\t"
277                 "mvn  %0, %0, lsr %5\n\t"
278                 "bic  %0, %0, %1, lsl %6\n\t"
279                 "mvn  %1, %1, lsr %5\n\t"
280                 "bic  %1, %1, %2, lsl %6\n\t"
281                 "ldr  %2, [%4, #8]"
282             : "+r"(x10), "+r"(x11), "+r"(x12), "=r"(x14)
283                         : "r"(blk), "r"(shift), "r"(shift2));
284
285    /* process x11 & x14 */
286    x11 = sad_4pixel(x11, x14, x9);
287
288    /* process x12 & x10 */
289    x10 = sad_4pixel(x10, x12, x9);
290
291    sum_accumulate;
292
293    /****************/
294    x10 = x5 - (x4 << 8); /* extract low bytes */
295    x10 = x10 + x4;     /* add with high bytes */
296    x10 = x10 + (x10 << 16); /* add with lower half word */
297
298    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
299    {
300        if (--x8)
301        {
302#if (NUMBER==3)
303            goto         LOOP_SAD3;
304#elif (NUMBER==2)
305goto         LOOP_SAD2;
306#elif (NUMBER==1)
307goto         LOOP_SAD1;
308#endif
309        }
310
311    }
312
313    return ((uint32)x10 >> 16);
314}
315
316#endif
317
318