sad_inline.h revision 3fdb405597f0e062a9bb8af20199c5e67f0f764c
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

/* Intentionally not using the gcc asm version, since it (if fixed so
 * as to not crash - the current register constraints are faulty) is
 * slightly slower than the plain C version on modern GCC versions. */
#if !defined(__CC_ARM) /* Generic C version */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;         /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);
        x7 = (x7 << 8) - x7;
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^ (x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }

#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

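        /* x9 is the per-byte sign-bit mask consumed by sad_4pixel().  The low
         * two bits of ref select one of the unaligned variants generated from
         * sad_mb_offset.h via the NUMBER/SHIFT macros above; the aligned case
         * falls through to LOOP_SAD0, which accumulates the packed per-byte
         * absolute differences in x4/x5 and exits early once the running SAD
         * (top halfword of x10) exceeds dmin. */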
        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;

        ref -= lx;
        blk -= 16;

        x8 = 16;

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);       /* extract low bytes */
        x10 = x10 + x4;             /* add with high bytes */
        x10 = x10 + (x10 << 16);    /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }

#elif defined(__CC_ARM)  /* only work with arm v5 */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;         /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm
        {
            EOR     x7, src2, src1;         /* check odd/even combination */
            ADDS    src1, src2, src1;
            EOR     x7, x7, src1;           /* only odd bytes need to add carry */
            ANDS    x7, mask, x7, rrx;
            RSB     x7, x7, x7, lsl #8;
            SUB     src1, src1, x7, asr #7; /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7; /* take absolute value of negative byte */
        }

        return src1;
    }

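/* sum_accumulate bundles one round of the "accumulate low bytes / accumulate
 * high bytes" bookkeeping (cf. the comments in the C loop above) into a single
 * asm block.  It is not referenced in this file itself; it exists for the
 * sad_mb_offset.h expansions included below. */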
#define sum_accumulate  __asm{  SBC  x5, x5, x10;  /* accumulate low bytes */  \
        BIC  x10, x6, x10;          /* x10 & 0xFF00FF00 */        \
        ADD  x4, x4, x10, lsr #8;   /* accumulate high bytes */   \
        SBC  x5, x5, x11;           /* accumulate low bytes */    \
        BIC  x11, x6, x11;          /* x11 & 0xFF00FF00 */        \
        ADD  x4, x4, x11, lsr #8; } /* accumulate high bytes */


#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);       /* extract low bytes */
        x10 = x10 + x4;             /* add with high bytes */
        x10 = x10 + (x10 << 16);    /* add with lower half word */

        __asm
        {
            /****************/
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm__ volatile(
            "rsbs   %1, %1, %2\n\t"
            "rsbmi  %1, %1, #0\n\t"
            "add    %0, %0, %1"
            : "+r"(sad), "+r"(tmp)
            : "r"(tmp2)
        );
        return sad;
    }

    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR    %1, %2, %0\n\t"
            "SUBS   %0, %2, %0\n\t"
            "EOR    %1, %1, %0\n\t"
            "AND    %1, %3, %1, lsr #1\n\t"
            "ORRCC  %1, %1, #0x80000000\n\t"
            "RSB    %1, %1, %1, lsl #8\n\t"
            "ADD    %0, %0, %1, asr #7\n\t"
            "EOR    %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

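    /* GCC extended-asm counterpart of the __CC_ARM sad_4pixelN above: the
     * instruction sequence (EOR, ADDS, EOR, ANDS rrx, RSB, SUB, EOR) is the
     * same, only the operand constraints differ. */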
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        __asm__ volatile(
            "EOR    %1, %2, %0\n\t"
            "ADDS   %0, %2, %0\n\t"
            "EOR    %1, %1, %0\n\t"
            "ANDS   %1, %3, %1, rrx\n\t"
            "RSB    %1, %1, %1, lsl #8\n\t"
            "SUB    %0, %0, %1, asr #7\n\t"
            "EOR    %0, %0, %1, asr #7"
            : "+r"(src1), "=&r"(x7)
            : "r"(src2), "r"(mask)
        );

        return src1;
    }

#define sum_accumulate  __asm__ volatile(              \
        "SBC  %0, %0, %1\n\t"                          \
        "BIC  %1, %4, %1\n\t"                          \
        "ADD  %2, %2, %1, lsr #8\n\t"                  \
        "SBC  %0, %0, %3\n\t"                          \
        "BIC  %3, %4, %3\n\t"                          \
        "ADD  %2, %2, %3, lsr #8"                      \
        : "+r" (x5), "+r" (x10), "+r" (x4), "+r" (x11) \
        : "r" (x6)                                     \
        );

#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        x8 = (uint32)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

        x8 = 16;
///
        __asm__ volatile("MVN   %0, #0xFF00": "=r"(x6));

LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 4));
        __asm__ volatile("LDR   %0, [%1], %2": "=&r"(x10), "+r"(ref): "r"(lx));
        //x10 = *((int32*)ref); ref+=lx;
        x14 = *((int32*)(blk + 4));
        __asm__ volatile("LDR   %0, [%1], #16": "=&r"(x12), "+r"(blk));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;                  /* accumulate low bytes */
        x10 = x10 & (x6 << 8);          /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);   /* accumulate high bytes */
        x5 = x5 + x11;                  /* accumulate low bytes */
        x11 = x11 & (x6 << 8);          /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);   /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8);       /* extract low bytes */
        x10 = x10 + x4;             /* add with high bytes */
        x10 = x10 + (x10 << 16);    /* add with lower half word */

        /****************/

        if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);
    }


#endif

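/* Minimal usage sketch (hypothetical caller, not part of this header):
 *
 *     int sad = simd_sad_mb(ref_ptr, cur_mb, best_sad, pitch);
 *     if (sad < best_sad) best_sad = sad;
 *
 * ref may have any byte alignment within the reference frame and is advanced
 * by lx (the reference line pitch) per row; blk is the 16x16 current
 * macroblock stored with a stride of 16; dmin is the best SAD found so far
 * and enables early termination of the row loop. */
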
#ifdef __cplusplus
}
#endif

#endif // _SAD_INLINE_H_