/* sad_inline.h revision f5af6314db25ff3bef9bd2eeba201bc6cc60805d */
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* SWAR ("SIMD within a register") kernels for the 16x16-macroblock
 * sum of absolute differences (SAD) used by motion estimation.
 *
 * Three alternative implementations are selected at compile time:
 *   - a portable plain-C version (the default),
 *   - an ARM RVCT (__CC_ARM) inline-assembly version,
 *   - a GCC/ARM inline-assembly version.
 *
 * Each variant also instantiates helpers for byte-misaligned reference
 * pointers (sad_mb_offset1/2/3) by repeatedly including
 * "sad_mb_offset.h" under different NUMBER/SHIFT (and, for the asm
 * variants, INC_X8) macro settings. */

/* Intentionally not using the gcc asm version, since it (if fixed so
 * as to not crash - the current register constraints are faulty) is
 * slightly slower than the plain C version on modern GCC versions.
 */
#if !defined(__CC_ARM) /* Generic C version */

    /* Add the absolute difference |tmp - tmp2| to the running SAD
     * accumulator 'sad' and return the updated value. */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;       /* diff positive: add as-is */
        else sad -= tmp;               /* diff <= 0: subtracting adds |diff| */

        return sad;
    }

    /* Treat src1 and src2 as four packed unsigned bytes (pixels) and
     * return the four per-byte absolute differences, packed the same
     * way.  'mask' must be 0x80808080 (per-byte sign-bit mask).
     *
     * The two 32-bit words are subtracted in a single operation; the
     * XOR bookkeeping in x7 then recovers, per byte, whether a borrow
     * crossed into that byte, so each byte lane can be corrected to
     * its true absolute difference. */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);   /* one flag bit (bit 7) per negative byte */
        x7 = (x7 << 8) - x7;             /* flag * 255: 0xFF80 pattern per flagged byte */
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }

/* Instantiate sad_mb_offsetN() for reference blocks misaligned by
 * N bytes (N = 3, 2, 1); SHIFT = 8*N controls how the two unaligned
 * words are re-assembled inside sad_mb_offset.h. */
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


    /* Compute the SAD between a 16x16 reference block at 'ref' (row
     * stride 'lx') and the current macroblock at 'blk' (rows stored
     * contiguously, stride 16).
     *
     * dmin is the smallest SAD found so far: after each 16-pixel row
     * the partial sum is compared against it, and the search is
     * abandoned as soon as the partial SAD exceeds dmin (the value
     * returned in that case is the partial sum, already > dmin).
     *
     * 'ref' may be arbitrarily byte-aligned; misaligned inputs are
     * dispatched to the sad_mb_offset1/2/3 variants generated above.
     *
     * Accumulation scheme: x5 sums all four byte lanes of each packed
     * result (low lanes may carry into high lanes), while x4 sums the
     * high bytes separately so the carries can be undone at the end. */
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const: per-byte sign-bit mask for sad_4pixel */

        /* Dispatch on the low two address bits of ref. */
        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4;        /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;    /* (x6 << 8) == 0xFF00FF00, high-byte lane mask */

        /* Bias pointers back one row; the loop pre-increments them. */
        ref -= lx;
        blk -= 16;

        x8 = 16;            /* 16 rows to process */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));      /* advance one row, pixels 0-3 */
        x11 = *((uint32*)(ref + 4));        /* pixels 4-7 */
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));        /* pixels 8-11 */
        x11 = *((uint32*)(ref + 12));       /* pixels 12-15 */
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /* Fold the packed accumulators into one scalar; the row SAD
         * ends up in the top 16 bits of x10. */
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }

#elif defined(__CC_ARM)  /* only work with arm v5 */

    /* RVCT inline-asm equivalent of the generic SUB_SAD:
     * sad += |tmp - tmp2| (RSBS sets flags, RSBMI negates if the
     * difference came out negative). */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    /* RVCT inline-asm equivalent of the generic sad_4pixel: packed
     * per-byte absolute difference of two 4-pixel words.  Uses the
     * carry flag from SUBS (via ORRCC) instead of the C version's
     * explicit unsigned compare. */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;  /* take absolute value of negative byte */
        }

        return src1;
    }

    /* Variant of sad_4pixel built around ADDS/rrx; used by the
     * sad_mb_offset.h instantiations below.  NOTE(review): appears to
     * expect one operand pre-negated by the caller - verify against
     * sad_mb_offset.h. */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

/* Accumulate the packed results x10/x11 into x5 (all byte lanes, SBC
 * to match the carry conventions of the sad_mb_offset.h asm loop) and
 * x4 (high bytes only).  Expects x4/x5/x6/x10/x11 live in the
 * including context. */
#define sum_accumulate  __asm{      SBC  x5, x5, x10;  /* accumulate low bytes */  \
                                    BIC  x10, x6, x10; /* x10 & 0xFF00FF00 */  \
                                    ADD  x4, x4, x10, lsr #8;  /* accumulate high bytes */  \
                                    SBC  x5, x5, x11;  /* accumulate low bytes */  \
                                    BIC  x11, x6, x11; /* x11 & 0xFF00FF00 */  \
                                    ADD  x4, x4, x11, lsr #8; } /* accumulate high bytes */

/* Instantiate sad_mb_offsetN() for ref misaligned by N bytes.
 * INC_X8 packs the loop-counter increment used by the asm loops
 * (semantics defined in sad_mb_offset.h). */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    /* RVCT mixed C/asm version of simd_sad_mb: 16x16 SAD with early
     * exit against dmin; see the generic C version for the contract.
     * The MOVS shifts the two low address bits of ref into the flags
     * (bit0 -> N, bit1 -> C) to dispatch the misaligned cases. */
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const: per-byte sign-bit mask */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;    /* x6 = 0xFFFF00FF */
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;    /* load and advance ref one row */
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;   /* load and advance blk one row */
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /* Fold packed accumulators; row SAD lands in x10's top 16 bits. */
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /* Early exit: loop while partial SAD <= dmin and the
             * packed row counter in x8 has not run out. */
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    /* GCC inline-asm equivalent of SUB_SAD: sad += |tmp - tmp2|. */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
__asm__ volatile("rsbs  %1, %1, %2\n\trsbmi %1, %1, #0\n\tadd  %0, %0, %1": "=r"(sad): "r"(tmp), "r"(tmp2));
        return sad;
    }

    /* GCC inline-asm equivalent of sad_4pixel: packed per-byte
     * absolute difference of two 4-pixel words (mask = 0x80808080). */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR  %1, %2, %0\n\tSUBS  %0, %2, %0\n\tEOR  %1, %1, %0\n\tAND  %1, %3, %1, lsr #1\n\tORRCC %1, %1, #0x80000000\n\tRSB  %1, %1, %1, lsl #8\n\tADD  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

    /* ADDS/rrx variant used by the sad_mb_offset.h instantiations.
     * NOTE(review): appears to expect one operand pre-negated by the
     * caller - verify against sad_mb_offset.h. */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR   %1, %2, %0\n\tADDS  %0, %2, %0\n\tEOR  %1, %1, %0\n\tANDS  %1, %3, %1, rrx\n\tRSB  %1, %1, %1, lsl #8\n\tSUB  %0, %0, %1, asr #7\n\tEOR  %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

/* Packed accumulation step for the sad_mb_offset.h asm loops; mirrors
 * the RVCT sum_accumulate macro above. */
#define sum_accumulate  __asm__ volatile("SBC  %0, %0, %1\n\tBIC   %1, %4, %1\n\tADD   %2, %2, %1, lsr #8\n\tSBC   %0, %0, %3\n\tBIC   %3, %4, %3\n\tADD   %2, %2, %3, lsr #8": "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11): "r" (x6));

/* Instantiate sad_mb_offsetN() for ref misaligned by N bytes; INC_X8
 * is the packed loop-counter increment (see sad_mb_offset.h). */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    /* GCC mixed C/asm version of simd_sad_mb: 16x16 SAD with early
     * exit against dmin; see the generic C version for the contract. */
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const: per-byte sign-bit mask */
        x4 = x5 = 0;

        /* NOTE(review): the generic version casts via (intptr_t); this
         * (uint32) cast is fine on this 32-bit-ARM-only path but would
         * truncate a 64-bit pointer. */
        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;    /* 16 rows to process */
__asm__ volatile("MVN   %0, #0xFF00": "=r"(x6));    /* x6 = 0xFFFF00FF */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        /* NOTE(review): 'ref' is consumed as an input through an
         * output-only "=r" constraint - these are the faulty
         * constraints the header comment warns about (should be "+r"). */
__asm__ volatile("LDR   %0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
__asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "=r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /* Fold packed accumulators; row SAD lands in x10's top 16 bits. */
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        /* NOTE(review): unsigned <= signed comparison promotes dmin to
         * unsigned here; the generic version casts the left side to
         * (int) first - confirm dmin is always non-negative. */
        if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_