/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

/*
 * Template body for sad_mb_offset1/2/3: computes the SAD (sum of absolute
 * differences) between a 16x16 macroblock `blk` and a reference area `ref`
 * that is misaligned by NUMBER (1, 2 or 3) bytes, with early termination
 * once the running SAD exceeds `dmin`.
 *
 * The including translation unit must define, before inclusion:
 *   NUMBER  - byte misalignment of ref (1, 2 or 3)
 *   SHIFT   - NUMBER * 8 (bit shift used to splice unaligned words)
 *   the types int32/uint32/uint8 and the helpers sad_4pixel, sad_4pixelN,
 *   sum_accumulate and INC_X8.
 * The file is included once per NUMBER value to stamp out each variant.
 *
 * Register-style naming (x4, x5, ...) mirrors the original hand-written
 * ARM assembly this code was derived from:
 *   x4 - accumulator for the high bytes of the packed partial SADs
 *   x5 - accumulator for the low bytes
 *   x8 - row loop counter (16 rows)
 *   x9 - 0x80808080 constant used by the SIMD-within-a-register helpers
 */

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER  */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    //  x5 = (x4<<8) - x4;
    x4 = x5 = 0;
    x6 = 0xFFFF00FF;          /* (x6 << 8) == 0xFF00FF00, mask for high bytes */
    x9 = 0x80808080;          /* const. used by sad_4pixel */
    ref -= NUMBER;            /* bic ref, ref, #3 -- word-align ref downward */
    ref -= lx;                /* pre-decrement; loop starts with ref += lx */
    blk -= 16;                /* pre-decrement; loop starts with blk += 16 */
    x8 = 16;                  /* 16 macroblock rows */

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
    /****** process 8 pixels ******/
    /* Load three aligned words and splice the misaligned 8 reference
     * pixels out of them with a funnel shift. */
    x10 = *((uint32*)(ref += lx));      /* D C B A */
    x11 = *((uint32*)(ref + 4));        /* H G F E */
    x12 = *((uint32*)(ref + 8));        /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);       /* 0 0 0 D */
    x10 = x10 | (x11 << (32 - SHIFT));  /* G F E D */
    x11 = ((uint32)x11 >> SHIFT);       /* 0 0 0 H */
    x11 = x11 | (x12 << (32 - SHIFT));  /* K J I H */

    x12 = *((uint32*)(blk += 16));
    x14 = *((uint32*)(blk + 4));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    /* Fold the packed per-byte absolute differences into the two
     * accumulators: low bytes into x5, high bytes (shifted down) into x4. */
    x5 = x5 + x10;                      /* accumulate low bytes */
    x10 = x10 & (x6 << 8);              /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);       /* accumulate high bytes */
    x5 = x5 + x11;                      /* accumulate low bytes */
    x11 = x11 & (x6 << 8);              /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);       /* accumulate high bytes */

    /****** process 8 pixels ******/
    x10 = *((uint32*)(ref + 8));        /* D C B A */
    x11 = *((uint32*)(ref + 12));       /* H G F E */
    x12 = *((uint32*)(ref + 16));       /* L K J I */

    x10 = ((uint32)x10 >> SHIFT);       /* mvn x10, x10, lsr #24  = 0xFF 0xFF 0xFF ~D */
    x10 = x10 | (x11 << (32 - SHIFT));  /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */
    x11 = ((uint32)x11 >> SHIFT);       /* 0xFF 0xFF 0xFF ~H */
    x11 = x11 | (x12 << (32 - SHIFT));  /* ~K ~J ~I ~H */

    x12 = *((uint32*)(blk + 8));
    x14 = *((uint32*)(blk + 12));

    /* process x11 & x14 */
    x11 = sad_4pixel(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixel(x10, x12, x9);

    x5 = x5 + x10;                      /* accumulate low bytes */
    x10 = x10 & (x6 << 8);              /* x10 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x10 >> 8);       /* accumulate high bytes */
    x5 = x5 + x11;                      /* accumulate low bytes */
    x11 = x11 & (x6 << 8);              /* x11 & 0xFF00FF00 */
    x4 = x4 + ((uint32)x11 >> 8);       /* accumulate high bytes */

    /****************/
    /* Reduce the packed accumulators to a single 16-bit SAD in the top
     * halfword of x10. */
    x10 = x5 - (x4 << 8);               /* extract low bytes */
    x10 = x10 + x4;                     /* add with high bytes */
    x10 = x10 + (x10 << 16);            /* add with lower half word */

    if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin: early exit
                                             once SAD already exceeds dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto         LOOP_SAD3;
#elif (NUMBER==2)
            goto         LOOP_SAD2;
#elif (NUMBER==1)
            goto         LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#elif defined(__CC_ARM)  /* only work with arm v5 */

/* RVCT (armcc) variant using embedded assembly.  NOTE(review): x8 arrives
 * as an extra parameter here (the caller supplies the loop counter),
 * unlike the GCC variants above -- confirm against the call sites. */
#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8)
#endif
{
    int32 x4, x5, x6, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;

    __asm{
        MVN x6, #0xff0000;              /* x6 = 0xFF00FFFF */
#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
        BIC ref, ref, #3;               /* word-align ref downward */
    }
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

    __asm{
        /* funnel-shift the misaligned pixels out of the aligned words;
         * MVN/BIC produce the bitwise complement expected by sad_4pixelN */
        MVN x10, x10, lsr #SHIFT;
        BIC x10, x10, x11, lsl #(32-SHIFT);
        MVN x11, x11, lsr #SHIFT;
        BIC x11, x11, x12, lsl #(32-SHIFT);

        LDR x12, [blk, #8];
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    __asm{
        /****** process 8 pixels ******/
        LDR x11, [ref, #4];
        LDR x12, [ref, #8];
        LDR x10, [ref], lx ;            /* load and advance ref to next row */
        LDR x14, [blk, #4];

        MVN x10, x10, lsr #SHIFT;
        BIC x10, x10, x11, lsl #(32-SHIFT);
        MVN x11, x11, lsr #SHIFT;
        BIC x11, x11, x12, lsl #(32-SHIFT);

        LDR x12, [blk], #16;            /* load and advance blk to next row */
    }

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);               /* extract low bytes */
    x10 = x10 + x4;                     /* add with high bytes */
    x10 = x10 + (x10 << 16);            /* add with lower half word */

    __asm{
        /* loop while (x10 >> 16) <= dmin and the row counter (kept in the
         * condition-flag-coupled x8) has not run out */
        RSBS x11, dmin, x10, lsr #16
        ADDLSS x8, x8, #INC_X8
#if (NUMBER==3)
        BLS LOOP_SAD3;
#elif (NUMBER==2)
        BLS LOOP_SAD2;
#elif (NUMBER==1)
        BLS LOOP_SAD1;
#endif
    }

    return ((uint32)x10 >> 16);
}

#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

/* NOTE(review): this guard is identical to the first branch's, so this
 * inline-assembly variant is unreachable dead code (the #elif can never be
 * taken).  Kept for reference; the asm operand constraints below have been
 * corrected regardless: operands that are read *and* written must be "+r"
 * (with "&" earlyclobber where an operand is written before the last input
 * is read), not write-only "=r"/"=&r" as before, which left their input
 * values undefined. */

#if (NUMBER==3)
__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==2)
__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin)
#elif (NUMBER==1)
__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin)
#endif
{
    int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

    x9 = 0x80808080; /* const. */
    x4 = x5 = 0;
    x8 = 16;         /* 16 macroblock rows */

__asm__ volatile("MVN %0, #0xFF0000": "=r"(x6));

#if (NUMBER==3)
LOOP_SAD3:
#elif (NUMBER==2)
LOOP_SAD2:
#elif (NUMBER==1)
LOOP_SAD1:
#endif
/* ref is read and rewritten: "+r", not "=r" */
__asm__ volatile("BIC %0, %0, #3": "+r"(ref));
    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 12));
    x12 = *((int32*)(ref + 16));
    x10 = *((int32*)(ref + 8));
    x14 = *((int32*)(blk + 12));

/* x10/x11 are read-modify-write and are written before the final read of
 * x12, hence "+&r"; MVN/BIC yield complemented pixels for sad_4pixelN */
#if (SHIFT==8)
__asm__ volatile("MVN %0, %0, lsr #8\n\tBIC %0, %0, %1,lsl #24\n\tMVN %1, %1,lsr #8\n\tBIC %1, %1, %2,lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
__asm__ volatile("MVN %0, %0, lsr #16\n\tBIC %0, %0, %1,lsl #16\n\tMVN %1, %1,lsr #16\n\tBIC %1, %1, %2,lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
__asm__ volatile("MVN %0, %0, lsr #24\n\tBIC %0, %0, %1,lsl #8\n\tMVN %1, %1,lsr #24\n\tBIC %1, %1, %2,lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif

    x12 = *((int32*)(blk + 8));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****** process 8 pixels ******/
    x11 = *((int32*)(ref + 4));
    x12 = *((int32*)(ref + 8));
    x10 = *((int32*)ref); ref += lx;    /* advance ref to the next row */
    x14 = *((int32*)(blk + 4));

#if (SHIFT==8)
__asm__ volatile("MVN %0, %0, lsr #8\n\tBIC %0, %0, %1,lsl #24\n\tMVN %1, %1,lsr #8\n\tBIC %1, %1, %2,lsl #24": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==16)
__asm__ volatile("MVN %0, %0, lsr #16\n\tBIC %0, %0, %1,lsl #16\n\tMVN %1, %1,lsr #16\n\tBIC %1, %1, %2,lsl #16": "+&r"(x10), "+&r"(x11): "r"(x12));
#elif (SHIFT==24)
__asm__ volatile("MVN %0, %0, lsr #24\n\tBIC %0, %0, %1,lsl #8\n\tMVN %1, %1,lsr #24\n\tBIC %1, %1, %2,lsl #8": "+&r"(x10), "+&r"(x11): "r"(x12));
#endif
/* post-indexed load: blk is both the address source and written back, "+r";
 * x12 is earlyclobber so it cannot share blk's register */
__asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "+r"(blk));

    /* process x11 & x14 */
    x11 = sad_4pixelN(x11, x14, x9);

    /* process x12 & x10 */
    x10 = sad_4pixelN(x10, x12, x9);

    sum_accumulate;

    /****************/
    x10 = x5 - (x4 << 8);               /* extract low bytes */
    x10 = x10 + x4;                     /* add with high bytes */
    x10 = x10 + (x10 << 16);            /* add with lower half word */

    if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */
    {
        if (--x8)
        {
#if (NUMBER==3)
            goto LOOP_SAD3;
#elif (NUMBER==2)
            goto LOOP_SAD2;
#elif (NUMBER==1)
            goto LOOP_SAD1;
#endif
        }

    }

    return ((uint32)x10 >> 16);
}

#endif