/* sad_mb_offset.h — revision ccde1257952d2c073e51ecba6180060570ffa41f */
1/* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 19/* Intentionally not using the gcc asm version, since it (if fixed so 20 * as to not crash - the current register constraints are faulty) is 21 * slightly slower than the plain C version on modern GCC versions. */ 22#if !defined(__CC_ARM) /* Generic C version */ 23 24#if (NUMBER==3) 25__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin) 26#elif (NUMBER==2) 27__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin) 28#elif (NUMBER==1) 29__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin) 30#endif 31{ 32 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14; 33 34 // x5 = (x4<<8) - x4; 35 x4 = x5 = 0; 36 x6 = 0xFFFF00FF; 37 x9 = 0x80808080; /* const. 
*/ 38 ref -= NUMBER; /* bic ref, ref, #3 */ 39 ref -= lx; 40 blk -= 16; 41 x8 = 16; 42 43#if (NUMBER==3) 44LOOP_SAD3: 45#elif (NUMBER==2) 46LOOP_SAD2: 47#elif (NUMBER==1) 48LOOP_SAD1: 49#endif 50 /****** process 8 pixels ******/ 51 x10 = *((uint32*)(ref += lx)); /* D C B A */ 52 x11 = *((uint32*)(ref + 4)); /* H G F E */ 53 x12 = *((uint32*)(ref + 8)); /* L K J I */ 54 55 x10 = ((uint32)x10 >> SHIFT); /* 0 0 0 D */ 56 x10 = x10 | (x11 << (32 - SHIFT)); /* G F E D */ 57 x11 = ((uint32)x11 >> SHIFT); /* 0 0 0 H */ 58 x11 = x11 | (x12 << (32 - SHIFT)); /* K J I H */ 59 60 x12 = *((uint32*)(blk += 16)); 61 x14 = *((uint32*)(blk + 4)); 62 63 /* process x11 & x14 */ 64 x11 = sad_4pixel(x11, x14, x9); 65 66 /* process x12 & x10 */ 67 x10 = sad_4pixel(x10, x12, x9); 68 69 x5 = x5 + x10; /* accumulate low bytes */ 70 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 71 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 72 x5 = x5 + x11; /* accumulate low bytes */ 73 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */ 74 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 75 76 /****** process 8 pixels ******/ 77 x10 = *((uint32*)(ref + 8)); /* D C B A */ 78 x11 = *((uint32*)(ref + 12)); /* H G F E */ 79 x12 = *((uint32*)(ref + 16)); /* L K J I */ 80 81 x10 = ((uint32)x10 >> SHIFT); /* mvn x10, x10, lsr #24 = 0xFF 0xFF 0xFF ~D */ 82 x10 = x10 | (x11 << (32 - SHIFT)); /* bic x10, x10, x11, lsl #8 = ~G ~F ~E ~D */ 83 x11 = ((uint32)x11 >> SHIFT); /* 0xFF 0xFF 0xFF ~H */ 84 x11 = x11 | (x12 << (32 - SHIFT)); /* ~K ~J ~I ~H */ 85 86 x12 = *((uint32*)(blk + 8)); 87 x14 = *((uint32*)(blk + 12)); 88 89 /* process x11 & x14 */ 90 x11 = sad_4pixel(x11, x14, x9); 91 92 /* process x12 & x10 */ 93 x10 = sad_4pixel(x10, x12, x9); 94 95 x5 = x5 + x10; /* accumulate low bytes */ 96 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 97 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 98 x5 = x5 + x11; /* accumulate low bytes */ 99 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */ 100 x4 
= x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 101 102 /****************/ 103 x10 = x5 - (x4 << 8); /* extract low bytes */ 104 x10 = x10 + x4; /* add with high bytes */ 105 x10 = x10 + (x10 << 16); /* add with lower half word */ 106 107 if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */ 108 { 109 if (--x8) 110 { 111#if (NUMBER==3) 112 goto LOOP_SAD3; 113#elif (NUMBER==2) 114 goto LOOP_SAD2; 115#elif (NUMBER==1) 116 goto LOOP_SAD1; 117#endif 118 } 119 120 } 121 122 return ((uint32)x10 >> 16); 123} 124 125#elif defined(__CC_ARM) /* only work with arm v5 */ 126 127#if (NUMBER==3) 128__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8) 129#elif (NUMBER==2) 130__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8) 131#elif (NUMBER==1) 132__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin, int32 x8) 133#endif 134{ 135 int32 x4, x5, x6, x9, x10, x11, x12, x14; 136 137 x9 = 0x80808080; /* const. */ 138 x4 = x5 = 0; 139 140 __asm{ 141 MVN x6, #0xff0000; 142#if (NUMBER==3) 143LOOP_SAD3: 144#elif (NUMBER==2) 145LOOP_SAD2: 146#elif (NUMBER==1) 147LOOP_SAD1: 148#endif 149 BIC ref, ref, #3; 150 } 151 /****** process 8 pixels ******/ 152 x11 = *((int32*)(ref + 12)); 153 x12 = *((int32*)(ref + 16)); 154 x10 = *((int32*)(ref + 8)); 155 x14 = *((int32*)(blk + 12)); 156 157 __asm{ 158 MVN x10, x10, lsr #SHIFT; 159 BIC x10, x10, x11, lsl #(32-SHIFT); 160 MVN x11, x11, lsr #SHIFT; 161 BIC x11, x11, x12, lsl #(32-SHIFT); 162 163 LDR x12, [blk, #8]; 164 } 165 166 /* process x11 & x14 */ 167 x11 = sad_4pixelN(x11, x14, x9); 168 169 /* process x12 & x10 */ 170 x10 = sad_4pixelN(x10, x12, x9); 171 172 sum_accumulate; 173 174 __asm{ 175 /****** process 8 pixels ******/ 176 LDR x11, [ref, #4]; 177 LDR x12, [ref, #8]; 178 LDR x10, [ref], lx ; 179 LDR x14, [blk, #4]; 180 181 MVN x10, x10, lsr #SHIFT; 182 BIC x10, x10, x11, lsl #(32-SHIFT); 183 MVN x11, x11, lsr #SHIFT; 184 BIC x11, x11, x12, lsl 
#(32-SHIFT); 185 186 LDR x12, [blk], #16; 187 } 188 189 /* process x11 & x14 */ 190 x11 = sad_4pixelN(x11, x14, x9); 191 192 /* process x12 & x10 */ 193 x10 = sad_4pixelN(x10, x12, x9); 194 195 sum_accumulate; 196 197 /****************/ 198 x10 = x5 - (x4 << 8); /* extract low bytes */ 199 x10 = x10 + x4; /* add with high bytes */ 200 x10 = x10 + (x10 << 16); /* add with lower half word */ 201 202 __asm{ 203 RSBS x11, dmin, x10, lsr #16 204 ADDLSS x8, x8, #INC_X8 205#if (NUMBER==3) 206 BLS LOOP_SAD3; 207#elif (NUMBER==2) 208BLS LOOP_SAD2; 209#elif (NUMBER==1) 210BLS LOOP_SAD1; 211#endif 212 } 213 214 return ((uint32)x10 >> 16); 215} 216 217#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */ 218 219#if (NUMBER==3) 220__inline int32 sad_mb_offset3(uint8 *ref, uint8 *blk, int lx, int dmin) 221#elif (NUMBER==2) 222__inline int32 sad_mb_offset2(uint8 *ref, uint8 *blk, int lx, int dmin) 223#elif (NUMBER==1) 224__inline int32 sad_mb_offset1(uint8 *ref, uint8 *blk, int lx, int dmin) 225#endif 226{ 227 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14; 228 229 x9 = 0x80808080; /* const. 
*/ 230 x4 = x5 = 0; 231 x8 = 16; //<<===========******* 232 233 __asm__ volatile("MVN %0, #0xFF0000": "=r"(x6)); 234 235#if (NUMBER==3) 236LOOP_SAD3: 237#elif (NUMBER==2) 238LOOP_SAD2: 239#elif (NUMBER==1) 240LOOP_SAD1: 241#endif 242 __asm__ volatile("BIC %0, %0, #3": "=r"(ref)); 243 /****** process 8 pixels ******/ 244 x11 = *((int32*)(ref + 12)); 245 x12 = *((int32*)(ref + 16)); 246 x10 = *((int32*)(ref + 8)); 247 x14 = *((int32*)(blk + 12)); 248 249#if (SHIFT==8) 250 __asm__ volatile( 251 "MVN %0, %0, lsr #8\n\t" 252 "BIC %0, %0, %1, lsl #24\n\t" 253 "MVN %1, %1, lsr #8\n\t" 254 "BIC %1, %1, %2, lsl #24" 255 : "=&r"(x10), "=&r"(x11) 256 : "r"(x12) 257 ); 258#elif (SHIFT==16) 259 __asm__ volatile( 260 "MVN %0, %0, lsr #16\n\t" 261 "BIC %0, %0, %1, lsl #16\n\t" 262 "MVN %1, %1, lsr #16\n\t" 263 "BIC %1, %1, %2, lsl #16" 264 : "=&r"(x10), "=&r"(x11) 265 : "r"(x12) 266 ); 267#elif (SHIFT==24) 268 __asm__ volatile( 269 "MVN %0, %0, lsr #24\n\t" 270 "BIC %0, %0, %1, lsl #8\n\t" 271 "MVN %1, %1, lsr #24\n\t" 272 "BIC %1, %1, %2, lsl #8" 273 : "=&r"(x10), "=&r"(x11) 274 : "r"(x12) 275 ); 276#endif 277 278 x12 = *((int32*)(blk + 8)); 279 280 /* process x11 & x14 */ 281 x11 = sad_4pixelN(x11, x14, x9); 282 283 /* process x12 & x10 */ 284 x10 = sad_4pixelN(x10, x12, x9); 285 286 sum_accumulate; 287 288 /****** process 8 pixels ******/ 289 x11 = *((int32*)(ref + 4)); 290 x12 = *((int32*)(ref + 8)); 291 x10 = *((int32*)ref); ref += lx; 292 x14 = *((int32*)(blk + 4)); 293 294#if (SHIFT==8) 295 __asm__ volatile( 296 "MVN %0, %0, lsr #8\n\t" 297 "BIC %0, %0, %1, lsl #24\n\t" 298 "MVN %1, %1, lsr #8\n\t" 299 "BIC %1, %1, %2, lsl #24" 300 : "=&r"(x10), "=&r"(x11) 301 : "r"(x12) 302 ); 303#elif (SHIFT==16) 304 __asm__ volatile( 305 "MVN %0, %0, lsr #16\n\t" 306 "BIC %0, %0, %1, lsl #16\n\t" 307 "MVN %1, %1, lsr #16\n\t" 308 "BIC %1, %1, %2, lsl #16" 309 : "=&r"(x10), "=&r"(x11) 310 : "r"(x12) 311 ); 312#elif (SHIFT==24) 313 __asm__ volatile( 314 "MVN %0, %0, lsr #24\n\t" 315 "BIC 
%0, %0, %1, lsl #8\n\t" 316 "MVN %1, %1, lsr #24\n\t" 317 "BIC %1, %1, %2, lsl #8" 318 : "=&r"(x10), "=&r"(x11) 319 : "r"(x12) 320 ); 321#endif 322 __asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "=r"(blk)); 323 324 /* process x11 & x14 */ 325 x11 = sad_4pixelN(x11, x14, x9); 326 327 /* process x12 & x10 */ 328 x10 = sad_4pixelN(x10, x12, x9); 329 330 sum_accumulate; 331 332 /****************/ 333 x10 = x5 - (x4 << 8); /* extract low bytes */ 334 x10 = x10 + x4; /* add with high bytes */ 335 x10 = x10 + (x10 << 16); /* add with lower half word */ 336 337 if (((uint32)x10 >> 16) <= (uint32)dmin) /* compare with dmin */ 338 { 339 if (--x8) 340 { 341#if (NUMBER==3) 342 goto LOOP_SAD3; 343#elif (NUMBER==2) 344 goto LOOP_SAD2; 345#elif (NUMBER==1) 346 goto LOOP_SAD1; 347#endif 348 } 349 350 } 351 352 return ((uint32)x10 >> 16); 353} 354 355#endif 356 357