sad_inline.h revision 4b43b41eaf8c4c80f66185e13620cf94b8b2ef5b
/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */

/*
 * SWAR ("SIMD within a register") sum-of-absolute-differences (SAD)
 * primitives for 16x16 macroblock motion estimation.
 *
 * Three implementations of the same interface are selected by compiler:
 *   1. portable C                (first  __GNUC__ && __arm__ branch)
 *   2. ARM ADS/RVCT inline asm   (__CC_ARM branch)
 *   3. GCC/ARM inline asm        (second __GNUC__ && __arm__ branch;
 *      NOTE(review): its condition duplicates branch 1, so it appears
 *      unreachable -- confirm this is intentional)
 *
 * The byte-offset helpers sad_mb_offset1/2/3 (called below but defined
 * elsewhere) are generated by repeatedly including "sad_mb_offset.h"
 * with different NUMBER/SHIFT (and, for asm branches, INC_X8) settings.
 */
#ifndef _SAD_INLINE_H_
#define _SAD_INLINE_H_

#ifdef __cplusplus
extern "C"
{
#endif

#if defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    /* Add |tmp - tmp2| to the running SAD accumulator and return it. */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        tmp = tmp - tmp2;
        if (tmp > 0) sad += tmp;
        else sad -= tmp;        /* tmp <= 0 here, so this adds |tmp| */

        return sad;
    }

    /*
     * Byte-wise absolute difference of four packed pixels.
     *
     * src1/src2 each hold 4 unsigned bytes; mask is the caller's constant
     * 0x80808080 (x9 at every call site).  Returns the four per-byte
     * |src1 - src2| values packed into one word: the word-level subtract
     * plus the XOR/carry bookkeeping below recovers, per byte lane, which
     * lanes went negative, and those lanes are then conditionally negated.
     */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

        x7 = src2 ^ src1;       /* check odd/even combination */
        if ((uint32)src2 >= (uint32)src1)
        {
            src1 = src2 - src1;     /* subs */
        }
        else
        {
            src1 = src1 - src2;
        }
        x7 = x7 ^ src1;     /* only odd bytes need to add carry */
        x7 = mask & ((uint32)x7 >> 1);  /* keep one sign bit per byte lane */
        x7 = (x7 << 8) - x7;            /* spread sign to a full-byte mask (x7*255) */
        src1 = src1 + (x7 >> 7); /* add 0xFF to the negative byte, add back carry */
        src1 = src1 ^(x7 >> 7); /* take absolute value of negative byte */

        return src1;
    }

/* Generate sad_mb_offset3 (ref misaligned by 3 bytes); SHIFT = 8*NUMBER
 * is the bit realignment applied inside sad_mb_offset.h. */
#define NUMBER 3
#define SHIFT 24

#include "sad_mb_offset.h"

/* Generate sad_mb_offset2 (ref misaligned by 2 bytes). */
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#include "sad_mb_offset.h"

/* Generate sad_mb_offset1 (ref misaligned by 1 byte). */
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#include "sad_mb_offset.h"


    /*
     * SAD between a 16x16 area of the reference frame (ref, row stride lx)
     * and a 16x16 macroblock (blk, rows of 16 contiguous bytes).
     *
     * If ref is not 32-bit aligned, work is delegated to the matching
     * sad_mb_offset[123] variant.  Otherwise 16 rows are processed 8 pixels
     * at a time with two SWAR accumulators: x5 collects the low bytes of
     * the packed per-pixel differences, x4 the high bytes.  After each row
     * the partial SAD ((uint32)x10 >> 16) is compared with dmin (the
     * caller's best SAD so far) and the loop terminates early as soon as
     * the partial sum exceeds dmin, returning the partial value.
     *
     * NOTE(review): rows are loaded through uint32* casts of a uint8*
     * buffer, which relies on the platform tolerating this aliasing --
     * confirm for new toolchains.
     */
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */

        /* Dispatch on the low two address bits of ref. */
        x8 = (intptr_t)ref & 0x3;
        if (x8 == 3)
            goto SadMBOffset3;
        if (x8 == 2)
            goto SadMBOffset2;
        if (x8 == 1)
            goto SadMBOffset1;

//  x5 = (x4<<8)-x4; /* x5 = x4*255; */
        x4 = x5 = 0;

        x6 = 0xFFFF00FF;    /* lane mask; (x6 << 8) == 0xFF00FF00 below */

        /* Pre-bias so the loads at the loop head can pre-increment. */
        ref -= lx;
        blk -= 16;

        x8 = 16;    /* row counter */

LOOP_SAD0:
        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref += lx));
        x11 = *((uint32*)(ref + 4));
        x12 = *((uint32*)(blk += 16));
        x14 = *((uint32*)(blk + 4));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****** process 8 pixels ******/
        x10 = *((uint32*)(ref + 8));
        x11 = *((uint32*)(ref + 12));
        x12 = *((uint32*)(blk + 8));
        x14 = *((uint32*)(blk + 12));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        if ((int)((uint32)x10 >> 16) <= dmin) /* compare with dmin */
        {
            if (--x8)
            {
                goto LOOP_SAD0;
            }

        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin);

    }

#elif defined(__CC_ARM)  /* only work with arm v5 */

    /* RVCT asm version of SUB_SAD: sad += |tmp - tmp2| (rsbs + rsbmi
     * compute the absolute difference without a branch). */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
        __asm
        {
            rsbs    tmp, tmp, tmp2 ;
            rsbmi   tmp, tmp, #0 ;
            add     sad, sad, tmp ;
        }

        return sad;
    }

    /* RVCT asm version of sad_4pixel: byte-wise |src1 - src2| of four
     * packed pixels; mask is the 0x80808080 lane constant (x9 at call
     * sites).  Same algorithm as the C version above, with the word-level
     * borrow recovered from the carry flag (ORRCC). */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        __asm
        {
            EOR     x7, src2, src1;     /* check odd/even combination */
            SUBS    src1, src2, src1;
            EOR     x7, x7, src1;
            AND     x7, mask, x7, lsr #1;
            ORRCC   x7, x7, #0x80000000;
            RSB     x7, x7, x7, lsl #8;
            ADD     src1, src1, x7, asr #7;   /* add 0xFF to the negative byte, add back carry */
            EOR     src1, src1, x7, asr #7;   /* take absolute value of negative byte */
        }

        return src1;
    }

    /* Variant of sad_4pixel built around ADDS/rrx instead of SUBS;
     * presumably consumed by the sad_mb_offset.h expansions -- confirm. */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        __asm
        {
            EOR      x7, src2, src1;        /* check odd/even combination */
            ADDS     src1, src2, src1;
            EOR      x7, x7, src1;      /* only odd bytes need to add carry */
            ANDS     x7, mask, x7, rrx;
            RSB      x7, x7, x7, lsl #8;
            SUB      src1, src1, x7, asr #7;  /* add 0xFF to the negative byte, add back carry */
            EOR      src1, src1, x7, asr #7;  /* take absolute value of negative byte */
        }

        return src1;
    }

/* Fold one pair of packed difference words (x10, x11) into the split
 * accumulators: x5 gathers low bytes (SBC), x4 gathers high bytes.
 * x6 is the 0xFF00FF00-style lane mask.  Used by sad_mb_offset.h. */
#define sum_accumulate  __asm{      SBC  x5, x5, x10;  /* accumulate low bytes */  \
        BIC  x10, x6, x10;   /* x10 & 0xFF00FF00 */  \
        ADD  x4, x4, x10,lsr #8;   /* accumulate high bytes */  \
        SBC  x5, x5, x11;  /* accumulate low bytes */  \
        BIC  x11, x6, x11;   /* x11 & 0xFF00FF00 */  \
        ADD  x4, x4, x11,lsr #8; } /* accumulate high bytes */


/* Generate sad_mb_offset3; INC_X8 is the packed loop-counter increment
 * used by the asm loop control (see ADDLSS in simd_sad_mb below). */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

/* Generate sad_mb_offset2. */
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

/* Generate sad_mb_offset1. */
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
#undef INC_X8
#define INC_X8 0x08000001
#include "sad_mb_offset.h"


    /*
     * RVCT (__CC_ARM) version of simd_sad_mb: SAD of a 16x16 macroblock
     * (blk, 16 bytes/row) against a reference area (ref, row stride lx),
     * with early exit once the partial SAD exceeds dmin.  Mirrors the C
     * version in the first branch of this file, but alignment dispatch and
     * loop control live in inline asm (the C gotos/labels are branch
     * targets for the asm).
     */
    __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx)
    {
        int32 x4, x5, x6, x8, x9, x10, x11, x12, x14;

        x9 = 0x80808080; /* const. */
        x4 = x5 = 0;

        __asm
        {
            /* Shift the two low address bits of ref into N (bit0) and
             * C (bit1): BHI => ref&3==3, BCS => ==2, BMI => ==1. */
            MOVS    x8, ref, lsl #31 ;
            BHI     SadMBOffset3;
            BCS     SadMBOffset2;
            BMI     SadMBOffset1;

            MVN     x6, #0xFF00;    /* x6 = 0xFFFF00FF lane mask */
        }
LOOP_SAD0:
        /****** process 8 pixels ******/
        x11 = *((int32*)(ref + 12));
        x10 = *((int32*)(ref + 8));
        x14 = *((int32*)(blk + 12));
        x12 = *((int32*)(blk + 8));

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        __asm
        {
            /****** process 8 pixels ******/
            /* Post-indexed loads: advance ref by lx and blk by 16. */
            LDR     x11, [ref, #4];
            LDR     x10, [ref], lx ;
            LDR     x14, [blk, #4];
            LDR     x12, [blk], #16 ;
        }

        /* process x11 & x14 */
        x11 = sad_4pixel(x11, x14, x9);

        /* process x12 & x10 */
        x10 = sad_4pixel(x10, x12, x9);

        x5 = x5 + x10;  /* accumulate low bytes */
        x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x10 >> 8);  /* accumulate high bytes */
        x5 = x5 + x11;  /* accumulate low bytes */
        x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */
        x4 = x4 + ((uint32)x11 >> 8);  /* accumulate high bytes */

        /****************/
        x10 = x5 - (x4 << 8); /* extract low bytes */
        x10 = x10 + x4;     /* add with high bytes */
        x10 = x10 + (x10 << 16); /* add with lower half word */

        __asm
        {
            /****************/
            /* (x10>>16) - dmin: LS (no borrow-free positive result) means
             * partial SAD <= dmin, so keep looping.  Each ADDLSS bumps the
             * packed counter; after 16 adds of 0x10000001 the high word
             * overflows into C and BLS falls through. */
            RSBS    x11, dmin, x10, lsr #16;
            ADDLSS  x8, x8, #0x10000001;
            BLS     LOOP_SAD0;
        }

        return ((uint32)x10 >> 16);

SadMBOffset3:

        return sad_mb_offset3(ref, blk, lx, dmin, x8);

SadMBOffset2:

        return sad_mb_offset2(ref, blk, lx, dmin, x8);

SadMBOffset1:

        return sad_mb_offset1(ref, blk, lx, dmin, x8);
    }


/* NOTE(review): this condition is identical to the #if at the top of the
 * file, so this branch is normally never compiled -- confirm intended. */
#elif defined(__GNUC__) && defined(__arm__) /* ARM GNU COMPILER */

    /* GCC-asm SUB_SAD: sad += |tmp - tmp2|.
     * NOTE(review): %0 (sad) is both read and written by the asm but is
     * declared write-only "=r"; should arguably be "+r" -- confirm. */
    __inline int32 SUB_SAD(int32 sad, int32 tmp, int32 tmp2)
    {
__asm__ volatile("rsbs %1, %1, %2\n\trsbmi %1, %1, #0\n\tadd %0, %0, %1": "=r"(sad): "r"(tmp), "r"(tmp2));
        return sad;
    }

    /* GCC-asm sad_4pixel: byte-wise |src1 - src2| of four packed pixels
     * (mask = 0x80808080); same algorithm as the RVCT version above.
     * NOTE(review): %0 (src1) is read before being written yet declared
     * "=r"; should arguably be "+r" -- confirm. */
    __inline int32 sad_4pixel(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR %1, %2, %0\n\tSUBS %0, %2, %0\n\tEOR %1, %1, %0\n\tAND %1, %3, %1, lsr #1\n\tORRCC %1, %1, #0x80000000\n\tRSB %1, %1, %1, lsl #8\n\tADD %0, %0, %1, asr #7\n\tEOR %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

    /* GCC-asm sad_4pixelN: ADDS/rrx variant of sad_4pixel; presumably
     * consumed by the sad_mb_offset.h expansions -- confirm.
     * NOTE(review): same "=r" vs "+r" concern for %0 (src1). */
    __inline int32 sad_4pixelN(int32 src1, int32 src2, int32 mask)
    {
        int32 x7;

__asm__ volatile("EOR %1, %2, %0\n\tADDS %0, %2, %0\n\tEOR %1, %1, %0\n\tANDS %1, %3, %1, rrx\n\tRSB %1, %1, %1, lsl #8\n\tSUB %0, %0, %1, asr #7\n\tEOR %0, %0, %1, asr #7": "=r"(src1), "=&r"(x7): "r"(src2), "r"(mask));

        return src1;
    }

/* Split-accumulator fold used by sad_mb_offset.h: x5 += low bytes (SBC),
 * x4 += high bytes; x6 is the lane mask.
 * NOTE(review): x5/x4/x10/x11 are all read by the asm but declared as
 * write-only "=&r" outputs -- confirm against the RVCT macro above. */
#define sum_accumulate __asm__ volatile("SBC %0, %0, %1\n\tBIC %1, %4, %1\n\tADD %2, %2, %1, lsr #8\n\tSBC %0, %0, %3\n\tBIC %3, %4, %3\n\tADD %2, %2, %3, lsr #8": "=&r" (x5), "=&r" (x10), "=&r" (x4), "=&r" (x11): "r" (x6));

/* Generate sad_mb_offset3 for this branch. */
#define NUMBER 3
#define SHIFT 24
#define INC_X8 0x08000001

#include "sad_mb_offset.h"

/* Generate sad_mb_offset2. */
#undef NUMBER
#define NUMBER 2
#undef SHIFT
#define SHIFT 16
#undef INC_X8
#define INC_X8 0x10000001
#include "sad_mb_offset.h"

/* Generate sad_mb_offset1. */
#undef NUMBER
#define NUMBER 1
#undef SHIFT
#define SHIFT 8
385#undef INC_X8 386#define INC_X8 0x08000001 387#include "sad_mb_offset.h" 388 389 390 __inline int32 simd_sad_mb(uint8 *ref, uint8 *blk, int dmin, int lx) 391 { 392 int32 x4, x5, x6, x8, x9, x10, x11, x12, x14; 393 394 x9 = 0x80808080; /* const. */ 395 x4 = x5 = 0; 396 397 x8 = (uint32)ref & 0x3; 398 if (x8 == 3) 399 goto SadMBOffset3; 400 if (x8 == 2) 401 goto SadMBOffset2; 402 if (x8 == 1) 403 goto SadMBOffset1; 404 405 x8 = 16; 406/// 407__asm__ volatile("MVN %0, #0xFF00": "=r"(x6)); 408 409LOOP_SAD0: 410 /****** process 8 pixels ******/ 411 x11 = *((int32*)(ref + 12)); 412 x10 = *((int32*)(ref + 8)); 413 x14 = *((int32*)(blk + 12)); 414 x12 = *((int32*)(blk + 8)); 415 416 /* process x11 & x14 */ 417 x11 = sad_4pixel(x11, x14, x9); 418 419 /* process x12 & x10 */ 420 x10 = sad_4pixel(x10, x12, x9); 421 422 x5 = x5 + x10; /* accumulate low bytes */ 423 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 424 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 425 x5 = x5 + x11; /* accumulate low bytes */ 426 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */ 427 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 428 429 /****** process 8 pixels ******/ 430 x11 = *((int32*)(ref + 4)); 431__asm__ volatile("LDR %0, [%1], %2": "=&r"(x10), "=r"(ref): "r"(lx)); 432 //x10 = *((int32*)ref); ref+=lx; 433 x14 = *((int32*)(blk + 4)); 434__asm__ volatile("LDR %0, [%1], #16": "=&r"(x12), "=r"(blk)); 435 436 /* process x11 & x14 */ 437 x11 = sad_4pixel(x11, x14, x9); 438 439 /* process x12 & x10 */ 440 x10 = sad_4pixel(x10, x12, x9); 441 442 x5 = x5 + x10; /* accumulate low bytes */ 443 x10 = x10 & (x6 << 8); /* x10 & 0xFF00FF00 */ 444 x4 = x4 + ((uint32)x10 >> 8); /* accumulate high bytes */ 445 x5 = x5 + x11; /* accumulate low bytes */ 446 x11 = x11 & (x6 << 8); /* x11 & 0xFF00FF00 */ 447 x4 = x4 + ((uint32)x11 >> 8); /* accumulate high bytes */ 448 449 /****************/ 450 x10 = x5 - (x4 << 8); /* extract low bytes */ 451 x10 = x10 + x4; /* add with high bytes */ 452 
x10 = x10 + (x10 << 16); /* add with lower half word */ 453 454 /****************/ 455 456 if (((uint32)x10 >> 16) <= dmin) /* compare with dmin */ 457 { 458 if (--x8) 459 { 460 goto LOOP_SAD0; 461 } 462 463 } 464 465 return ((uint32)x10 >> 16); 466 467SadMBOffset3: 468 469 return sad_mb_offset3(ref, blk, lx, dmin); 470 471SadMBOffset2: 472 473 return sad_mb_offset2(ref, blk, lx, dmin); 474 475SadMBOffset1: 476 477 return sad_mb_offset1(ref, blk, lx, dmin); 478 } 479 480 481#endif 482 483#ifdef __cplusplus 484} 485#endif 486 487#endif // _SAD_INLINE_H_ 488 489