omxVCM4P2_MCReconBlock_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;// Description:
;//

;// Include standard headers
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files

        M_VARIANTS ARM1136JS

;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************
    IF ARM1136JS

;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
    ;// Description:
    ;//
    ;// dest[j] = (x[j] + y[j] + round) >> 1,   j = 0..3
    ;//
    ;// Similar to the UHADD8 instruction, but with a rounding value of 1
    ;// added to each byte sum before dividing by two, if round is 1.
    ;//
    ;// Syntax:
    ;// M_UHADD8R   $dest, $x, $y, $round, $mask
    ;//
    ;// Inputs:
    ;// $x        four packed bytes,   x[3] : x[2] : x[1] : x[0]
    ;// $y        four packed bytes,   y[3] : y[2] : y[1] : y[0]
    ;// $round    0 if no rounding to be added, 1 if rounding to be done
    ;//           (assembler constant: selects the code emitted, not a register)
    ;// $mask     some register set to 0x80808080
    ;//
    ;// Outputs:
    ;// $dest     four packed bytes,   z[3] : z[2] : z[1] : z[0]
    ;//
    ;// ARM1136 has no rounding halving-add, so the rounding case is built
    ;// from UHSUB8 using the per-byte identity:
    ;//   (x + y + 1) >> 1  ==  ((y - (255 - x)) >> 1) ^ 0x80

        MACRO
        M_UHADD8R   $dest, $x, $y, $round, $mask
        IF $round = 1
            IF  $dest /= $y
                MVN         $dest, $x           ;// $dest = ~x = 255 - x (per byte)
                UHSUB8      $dest, $y, $dest    ;// $dest = (y - (255 - x)) >> 1
                EOR         $dest, $dest, $mask ;// flip bit 7: adds 128 -> (x+y+1)>>1
            ELSE
                ;// $dest aliases $y: consume $y via the MVN instead so it is
                ;// not overwritten before the subtract
                MVN         $dest, $y
                UHSUB8      $dest, $x, $dest
                EOR         $dest, $dest, $mask
            ENDIF
        ELSE
            ;// No rounding requested: plain truncating halving add
            UHADD8      $dest, $x, $y
        ENDIF
        MEND
;// ***************************************************************************
    ;// Description:
    ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
    ;//
    ;// Syntax:
    ;// M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
    ;//
    ;// Inputs:
    ;// $pSrc     4 byte aligned source pointer to an address just less than
    ;//           or equal to the data location
    ;// $srcStep  The stride on source
    ;// $scratch  A scratch register, used internally for temp calculations
    ;// $offset   Difference of source data location to the source pointer
    ;//           Use when $offset != 0 (unaligned load)
    ;//
    ;// Outputs:
    ;// $pSrc     In case the macro accepts stride, it increments the pSrc by
    ;//           that value, else unchanged
    ;// $out0     four packed bytes,   z[3] : z[2] : z[1] : z[0]
    ;// $out1     four packed bytes,   z[7] : z[6] : z[5] : z[4]
    ;//
    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
    ;// register numbering (LDM register-list requirement). In case offset is
    ;// 0, $scratch is not modified.

        MACRO
        M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        IF $offset = 0
            ;// Aligned: the 8 wanted bytes are exactly two words
            LDM         $pSrc, {$out0, $out1}
            ADD         $pSrc, $pSrc, $srcStep
        ELSE
            ;// Unaligned: load 12 bytes from the aligned address, then funnel
            ;// shift each output right by 8*$offset, pulling in the low bytes
            ;// of the following word (little-endian layout assumed)
            LDM         $pSrc, {$out0, $out1, $scratch}
            ADD         $pSrc, $pSrc, $srcStep

            MOV         $out0, $out0, LSR #8 * $offset
            ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
            MOV         $out1, $out1, LSR #8 * $offset
            ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
        MEND

;// ***************************************************************************
    ;// Description:
    ;// Loads three words for X interpolation, updates pointer to next row. For
    ;// X interpolation, given a truncated-4byteAligned source pointer,
    ;// three continuous words are invariably required from there to get the
    ;// nine bytes from the source pointer for filtering.
    ;//
    ;// Syntax:
    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
    ;//
    ;// Inputs:
    ;// $pSrc     4 byte aligned source pointer to an address just less than
    ;//           or equal to the data location
    ;//
    ;// $srcStep  The stride on source
    ;//
    ;// $offset   Difference of source data location to the source pointer
    ;//           Use when $offset != 0 (unaligned load)
    ;//
    ;// Outputs:
    ;// $pSrc     Incremented by $srcStep
    ;//
    ;// $word0, $word1, $word2, $word3
    ;//           Three of these are outputs based on the $offset parameter.
    ;//           The outputs are specifically generated to be processed by
    ;//           the M_EXT_XINT macro. Following is the illustration to show
    ;//           how the nine bytes are spanned for different offsets from
    ;//           notTruncatedForAlignmentSourcePointer.
    ;//
    ;//           ------------------------------------------------------
    ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//           |------------------------------------------------------|
    ;//           |   0    |      0      | 0123  | 4567  | 8xxx  |       |
    ;//           |   1    |     -1      | x012  | 3456  | 78xx  |       |
    ;//           |   2    |     -2      | xx01  | 2345  | 678x  |       |
    ;//           |   3    |     -3      | xxx0  |       | 1234  | 5678  |
    ;//           ------------------------------------------------------
    ;//
    ;//           where the numbering (0-8) designates the 9 bytes from the
    ;//           start of a particular row. The illustration doesn't take
    ;//           into account the positioning of bytes within the word; the
    ;//           macro combination with M_EXT_XINT will work only in little
    ;//           endian environs.
    ;//
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with
    ;// ascending register numbering (LDM register-list requirement)

        MACRO
        M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        IF $offset /= 3
            ;// Offsets 0-2: the 9 bytes live in the first three words
            LDM         $pSrc, {$word0, $word1, $word2}
        ELSE
            ;// Offset 3: skip $word1 so that M_EXT_XINT finds bytes 1-8
            ;// already positioned in $word2/$word3 (see table above)
            LDM         $pSrc, {$word0, $word2, $word3}
        ENDIF
        ADD         $pSrc, $pSrc, $srcStep
        MEND

;// ***************************************************************************
    ;// Description:
    ;// Extract four registers of four pixels for X interpolation
    ;//
    ;// Syntax:
    ;// M_EXT_XINT  $offset, $word0, $word1, $word2, $word3
    ;//
    ;// Inputs:
    ;// $offset   Difference of source data location to the source pointer
    ;//           Use when $offset != 0 (unaligned load)
    ;//
    ;// $word0, $word1, $word2, $word3
    ;//           Three of these are inputs based on the $offset parameter.
    ;//           The inputs are specifically selected to be processed by
    ;//           the M_EXT_XINT macro.
    ;//
    ;//           ------------------------------------------------------
    ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
    ;//           |------------------------------------------------------|
    ;//           |   0    |      0      | 0123  | 4567  | 8xxx  | yyyy  |
    ;//           |   1    |     -1      | x012  | 3456  | 78xx  | yyyy  |
    ;//           |   2    |     -2      | xx01  | 2345  | 678x  | yyyy  |
    ;//           |   3    |     -3      | xxx0  | yyyy  | 1234  | 5678  |
    ;//           ------------------------------------------------------
    ;//
    ;// Outputs:
    ;// $word0, $word1, $word2, $word3
    ;//           Bytes from the original source pointer (not truncated for
    ;//           4 byte alignment) as shown in the table.
    ;//           -------------------------------
    ;//           | word0 | word1 | word2 | word3 |
    ;//           |-------------------------------|
    ;//           | 0123  | 4567  | 1234  | 5678  |
    ;//           -------------------------------
    ;//
    ;// Note: {$word0, $word1, $word2, $word3} should be registers with
    ;// ascending register numbering

        MACRO
        M_EXT_XINT  $offset, $word0, $word1, $word2, $word3
        IF $offset = 0
            ; $word0 and $word1 are ok
            ; $word2, $word3 are just 8 shifted versions
            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #24
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ELIF $offset = 3
            ; $word2 and $word3 are ok (taken care while loading itself)
            ; set $word0 & $word1
            MOV         $word0, $word0, LSR #24
            ORR         $word0, $word0, $word2, LSL #8
            MOV         $word1, $word2, LSR #24
            ORR         $word1, $word1, $word3, LSL #8
        ELSE
            ; offsets 1 and 2: funnel-shift bytes 0-7 into $word0/$word1,
            ; then derive the +1 shifted pair in $word2/$word3
            MOV         $word0, $word0, LSR #8 * $offset
            ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
            MOV         $word1, $word1, LSR #8 * $offset
            ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))

            MOV         $word3, $word1, LSR #8
            ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
            MOV         $word2, $word0, LSR #8
            ORR         $word2, $word2, $word1, LSL #24
        ENDIF
        MEND

;// ***************************************************************************
    ;// Description:
    ;// Computes half-sum and xor of two inputs and puts them in the input
    ;// registers in that order
    ;//
    ;// Syntax:
    ;// M_HSUM_XOR  $v0, $v1, $tmp
    ;//
    ;// Inputs:
    ;// $v0       a, first input
    ;// $v1       b, second input
    ;// $tmp      scratch register
    ;//
    ;// Outputs:
    ;// $v0       (a + b) / 2   (truncating per-byte halving add)
    ;// $v1       a ^ b         (retains the bits lost by the halving)

        MACRO
        M_HSUM_XOR  $v0, $v1, $tmp
        UHADD8      $tmp, $v0, $v1          ;// s0 = (a + b) >> 1
        EOR         $v1, $v0, $v1           ;// l0 = a ^ b
        MOV         $v0, $tmp               ;// s0
        MEND

;// ***************************************************************************
    ;// Description:
    ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type
    ;// in the mcReconBlock module. Very specific to the implementation of
    ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register
    ;// and "yMask" with mask variable "0x1010101x" set in it. In yMask the 4
    ;// lsbs are not significant and are used by the caller for the row
    ;// counter (y); hence the LSR #4 before use.
    ;//
    ;// Some points to note are:
    ;// 1. Input is a pair of pair-averages and Xors (from M_HSUM_XOR)
    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
    ;//    running average
    ;// 3. Output is in the first argument
    ;// 4. The lsb/xor terms recover the bits lost by the two halving adds;
    ;//    OP1/OP2 are chosen (AND/ORR vs ORR/AND) so the correction rounds
    ;//    up (+2) or truncates (+1) as requested
    ;//
    ;// Syntax:
    ;// M_AVG4      $sum0, $lsb0, $sum1, $lsb1, $rndVal
    ;//
    ;// Inputs:
    ;// $sum0     (a + b) >> 1, where a and b are 1st and 2nd inputs to be
    ;//           averaged
    ;// $lsb0     (a ^ b)
    ;// $sum1     (c + d) >> 1. Not modified
    ;// $lsb1     (c ^ d)       Not modified
    ;// $rndVal   Assembler Variable. 0 for rounding, 1 for no rounding
    ;//
    ;// Outputs:
    ;// $sum0     (a + b + c + d + 1) / 4 : If no rounding
    ;//           (a + b + c + d + 2) / 4 : If rounding

        MACRO
        M_AVG4      $sum0, $lsb0, $sum1, $lsb1, $rndVal
        LCLS OP1
        LCLS OP2
        IF $rndVal = 0 ;// rounding case
OP1     SETS "AND"
OP2     SETS "ORR"
        ELSE           ;// Not rounding case
OP1     SETS "ORR"
OP2     SETS "AND"
        ENDIF

        ;// Register re-use: "tmp" is scratch; $lsb0 is dead after the first
        ;// op and doubles as sum2; the result lands back in $sum0
        LCLS lsb2
        LCLS sum2
        LCLS dest

lsb2    SETS "tmp"
sum2    SETS "$lsb0"
dest    SETS "$sum0"

        $OP1        $lsb0, $lsb0, $lsb1         ;// e0 = e0 & e1 (or |, by mode)
        EOR         $lsb2, $sum0, $sum1         ;// e2 = s0 ^ s1
        $OP2        $lsb2, $lsb2, $lsb0         ;// e2 = e2 | e0 (or &, by mode)
        AND         $lsb2, $lsb2, yMask, LSR # 4 ;// e2 &= 0x01010101 (drop counter bits)
        UHADD8      $sum2, $sum0, $sum1         ;// s2 = (s0 + s1)/2
        UADD8       $dest, $sum2, $lsb2         ;// dest = s2 + e2 (carry correction)
        MEND

;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
    ;// Description:
    ;// Implement motion compensation routines using the named registers in
    ;// the callee function. Each of the following 4 macros implements one of
    ;// the 4 predict types. Each handles 8 cases, i.e. all combinations of
    ;// the 4 source-alignment offsets and the 2 rounding flags.
    ;//
    ;// Syntax:
    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
    ;//
    ;// Inputs:
    ;// $rndVal   Assembler Variable. 0 for rounding, 1 for no rounding
    ;// $offset   $pSrc MOD 4 value. Offset from 4 byte aligned location.
    ;//
    ;// Outputs:
    ;// Outputs come in the named registers of the callee functions.
    ;// Each macro loads the data from the source pointer, processes it and
    ;// stores to the destination pointer: the whole prediction cycle of the
    ;// Motion Compensation routine for one particular predictType.
    ;// After this only residue addition to the predicted values remains.

        MACRO
        M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for IntegerPixel predictType. Both
    ;// rounding cases are handled by the same code base (rounding has no
    ;// effect on a plain copy). It is just a copy from source to destination.
    ;// Two lines are done per loop to reduce stalls. The loop has been
    ;// software pipelined as well for that purpose (loads for the next
    ;// iteration issue before the branch).
    ;//
    ;// M_LOAD_X loads a whole row in two registers and then they are stored

CaseIntegerPixelRnd0Offset$offset
CaseIntegerPixelRnd1Offset$offset
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
        SUBS        y, y, #2                    ;// 2 rows per iteration
        STRD        tmp1, tmp2, [pDst], dstStep ;// STRD needs even/odd reg pair
        STRD        tmp3, tmp4, [pDst], dstStep
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        BGT         YloopIntegerPixelOffset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelX predictType. The two
    ;// rounding cases are handled by different code bases, spanned by
    ;// different macro calls. The loop has been software pipelined to reduce
    ;// stalls.
    ;//
    ;// Filtering involves averaging a pixel with the next horizontal pixel.
    ;// The M_LOAD_XINT and M_EXT_XINT combination generates 4 registers: 2
    ;// with all pixels in a row (4 pixels per register) and another 2 with
    ;// the pixels one position right of the initial row pixels. These packed
    ;// registers are appropriate for 4-lane SIMD. After that the M_UHADD8R
    ;// macro does the averaging, taking care of rounding as required.

CaseHalfPixelXRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR         mask, =0x80808080           ;// needed by M_UHADD8R rounding path
    ENDIF

        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
YloopHalfPixelXRnd$rndVal.Offset$offset
        SUBS        y, y, #1
        M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
        ;// Note rndVal 0 means "round", so M_UHADD8R's $round is 1-$rndVal
        M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
        M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
        STRD        tmp5, tmp6, [pDst], dstStep
        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
        BGT         YloopHalfPixelXRnd$rndVal.Offset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelY predictType. The two
    ;// rounding cases are handled by different code bases, spanned by
    ;// different macro calls. Pre-loading is used to avoid reloading the
    ;// same row twice.
    ;//
    ;// Filtering involves averaging a pixel with the next vertical pixel.
    ;// M_LOAD_X generates 2 registers with all pixels in a row, 4 pixels in
    ;// each register — packed registers appropriate for 4-lane SIMD. After
    ;// that the M_UHADD8R macro does the averaging, taking care of the
    ;// rounding as required.

CaseHalfPixelYRnd$rndVal.Offset$offset
    IF $rndVal = 0
        LDR         mask, =0x80808080           ;// needed by M_UHADD8R rounding path
    ENDIF

        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
YloopHalfPixelYRnd$rndVal.Offset$offset
        SUBS        y, y, #2                    ;// 2 rows per iteration
        ;// Processing one line
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
        M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
        STRD        tmp1, tmp2, [pDst], dstStep
        ;// Processing another line; tmp3/tmp4 row is re-used as the upper row
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
        M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
        M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
        STRD        tmp3, tmp4, [pDst], dstStep

        BGT         YloopHalfPixelYRnd$rndVal.Offset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
    ;// Algorithmic Description:
    ;// This handles motion compensation for HalfPixelXY predictType. The two
    ;// rounding cases are handled by different code bases, spanned by
    ;// different macro calls. Pre-loading is used to avoid reloading the
    ;// same row twice.
    ;//
    ;// Filtering involves averaging a pixel with the next vertical,
    ;// horizontal and right-down diagonal pixels. Just as in the HalfPixelX
    ;// case, the M_LOAD_XINT and M_EXT_XINT combination generates 4
    ;// registers with a row and its 1-pixel-right-shifted version, 4 pixels
    ;// per register. Another call of that macro combination gets another
    ;// row. Then M_HSUM_XOR is called to get the mutual half-sum and xor of
    ;// a row with its shifted version, as those are the inputs to the M_AVG4
    ;// macro which computes the 4-element average with rounding. Note that
    ;// it is the half-sum/xor values that are preserved for the next row, as
    ;// they can be re-used in the next call to M_AVG4, saving recomputation.
    ;// Due to lack of registers, the row counter and the masking value
    ;// required by M_AVG4 are packed into the single register yMask: the low
    ;// nibble holds the row counter and the rest holds the mask left-shifted
    ;// by 4 (see M_AVG4's "yMask, LSR #4").

CaseHalfPixelXYRnd$rndVal.Offset$offset
        LDR         yMask, =((0x01010101 << 4) + 8) ;// mask<<4 | row counter (8 rows)

        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp                   ;// s0, l0
        M_HSUM_XOR  t01, t11, tmp                   ;// s0', l0'

YloopHalfPixelXYRnd$rndVal.Offset$offset
        ;// Processing one line
        ;// t00, t01, t10, t11 required from previous loop
        M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
        SUB         yMask, yMask, #2                ;// row counter -= 2 (low nibble)
        M_EXT_XINT  $offset, t20, t21, t30, t31
        M_HSUM_XOR  t20, t30, tmp                   ;// s1, l1
        M_HSUM_XOR  t21, t31, tmp                   ;// s1', l1'
        M_AVG4      t00, t10, t20, t30, $rndVal     ;// s0, l0, s1, l1
        M_AVG4      t01, t11, t21, t31, $rndVal     ;// s0', l0', s1', l1'
        STRD        t00, t01, [pDst], dstStep       ;// store the average

        ;// Processing another line
        ;// t20, t21, t30, t31 required from above
        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        TST         yMask, #7                       ;// Z set when counter hits 0
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp
        M_HSUM_XOR  t01, t11, tmp
        M_AVG4      t20, t30, t00, t10, $rndVal
        M_AVG4      t21, t31, t01, t11, $rndVal
        STRD        t20, t21, [pDst], dstStep

        BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset

        ;// The very last expansion (offset 3, no-round) is placed immediately
        ;// before SwitchPredictTypeEnd, so its branch is omitted and control
        ;// simply falls through
    IF $offset/=3 :LOR: $rndVal/=1
        B           SwitchPredictTypeEnd
    ENDIF
        MEND
;// ***************************************************************************
;// Motion compensation handler macros end here
;//
;// ***************************************************************************
    ;// Description:
    ;// Populates all 4 kinds of offset "cases" for each predictType and
    ;// rndVal combination in the "switch" to the prediction processing code
    ;// segment
    ;//
    ;// Syntax:
    ;// M_CASE_OFFSET $rnd, $predictType
    ;//
    ;// Inputs:
    ;// $rnd          0 for rounding, 1 for no rounding
    ;// $predictType  The prediction mode
    ;//
    ;// Outputs:
    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro

        MACRO
        M_CASE_OFFSET $rnd, $predictType
        M_CASE      Case$predictType.Rnd$rnd.Offset0
        M_CASE      Case$predictType.Rnd$rnd.Offset1
        M_CASE      Case$predictType.Rnd$rnd.Offset2
        M_CASE      Case$predictType.Rnd$rnd.Offset3
        MEND
;// ***************************************************************************
    ;// Description:
    ;// Populates both kinds of rounding "cases" for each predictType in the
    ;// "switch" to the prediction processing code segment
    ;//
    ;// Syntax:
    ;// M_CASE_MCRECONBLOCK $predictType
    ;//
    ;// Inputs:
    ;// $predictType  The prediction mode
    ;//
    ;// Outputs:
    ;// Populated list of "M_CASE_OFFSET" macros

        MACRO
        M_CASE_MCRECONBLOCK $predictType
        M_CASE_OFFSET 0, $predictType               ;// 0 for rounding
        M_CASE_OFFSET 1, $predictType               ;// 1 for no rounding
        MEND
;// ***************************************************************************
    ;// Description:
    ;// Populates all 8 kinds of rounding and offset combination handling
    ;// macros for the specified predictType. In case of the "IntegerPixel"
    ;// predictType, rounding is not required so the same code segment
    ;// handles both cases.
    ;// Syntax:
    ;// M_MCRECONBLOCK $predictType
    ;//
    ;// Inputs:
    ;// $predictType  The prediction mode
    ;//
    ;// Outputs:
    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for the
    ;// specified predictType. Each
    ;//     M_MCRECONBLOCK_<predictType> $rnd, $offset
    ;// is a code segment (starting with a label indicating the predictType,
    ;// rounding and offset combination).
    ;// Four calls of this macro with the 4 prediction modes populate all 32
    ;// handlers.

        MACRO
        M_MCRECONBLOCK $predictType
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
    IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
    ENDIF
        MEND
;// ***************************************************************************
;// Input/Output Registers
;// Note the deliberate aliasing: several names share a physical register and
;// are live in disjoint phases (e.g. mask/rndVal = r11, y/yMask = r14,
;// pSrcResidue/zero/tmp/tmp9 = r12, dst/srcStep = r1, return/pSrc = r0).
pSrc            RN 0
srcStep         RN 1
arg_pSrcResidue RN 2
pSrcResidue     RN 12
pDst            RN 3
dstStep         RN 2
predictType     RN 10
rndVal          RN 11
mask            RN 11

;// Local Scratch Registers
zero            RN 12
y               RN 14

tmp1            RN 4
tmp2            RN 5
tmp3            RN 6
tmp4            RN 7
tmp5            RN 8
tmp6            RN 9
tmp7            RN 10
tmp8            RN 11
tmp9            RN 12

t00             RN 4
t01             RN 5
t10             RN 6
t11             RN 7
t20             RN 8
t21             RN 9
t30             RN 10
t31             RN 11
tmp             RN 12

yMask           RN 14

dst             RN 1
return          RN 0

        ;// Allocate memory on stack
        M_ALLOC4    Stk_pDst, 4
        M_ALLOC4    Stk_pSrcResidue, 4
        ;// Function header
        M_START     omxVCM4P2_MCReconBlock, r11
        ;// Define stack arguments
        M_ARG       Arg_dstStep, 4
        M_ARG       Arg_predictType, 4
        M_ARG       Arg_rndVal, 4
        ;// Save on stack (pDst/pSrcResidue registers are recycled below)
        M_STR       pDst, Stk_pDst
        M_STR       arg_pSrcResidue, Stk_pSrcResidue
        ;// Load arguments from the stack
        M_LDR       dstStep, Arg_dstStep
        M_LDR       predictType, Arg_predictType
        M_LDR       rndVal, Arg_rndVal

        MOV         y, #8                       ;// 8 rows per 8x8 block

        ;// Build the 5-bit switch index:
        ;//   index = (predictType << 3) | (rndVal << 2) | (pSrc & 3)
        AND         tmp1, pSrc, #3
        ORR         predictType, tmp1, predictType, LSL #3
        ORR         predictType, predictType, rndVal, LSL #2
        ;// Truncate source pointer to a 4 byte aligned location; the
        ;// handlers compensate via the offset encoded in the index above
        BIC         pSrc, pSrc, #3

        ;// Implementation takes care of all combinations of different
        ;// predictTypes, rounding cases and source pointer offsets to
        ;// alignment of 4 bytes in different code bases, unless one of these
        ;// parameters makes no difference to the implementation. The
        ;// M_CASE_MCRECONBLOCK macros below branch into 8 M_CASE macros each,
        ;// for all combinations of the 2 rounding cases and 4 offsets of the
        ;// pSrc pointer to the 4 byte alignment.
        M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
        M_ENDSWITCH

        ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
        ;// particular macros (4 in case of IntegerPixel, as rounding makes no
        ;// difference there) to generate the code for all cases of rounding
        ;// and offsets. LTORG is used to segment the code (flush literal
        ;// pool) as the code size bloated beyond 4KB.
        M_MCRECONBLOCK IntegerPixel
        M_MCRECONBLOCK HalfPixelX
        LTORG
        M_MCRECONBLOCK HalfPixelY
        M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd

        ;// Residue Addition
        ;// This is done in 2-lane SIMD, though loads are further optimized
        ;// and 4 bytes are loaded at a time from the destination buffer.
        ;// Algorithmic details are in the inlined comments. The residue is
        ;// skipped entirely when pSrcResidue is NULL.
        M_LDR       pSrcResidue, Stk_pSrcResidue
        CMP         pSrcResidue, #0
        BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
        M_LDR       pDst, Stk_pDst
        MOV         y, #8
        SUB         dstStep, dstStep, #4        ;// second STR below advances 4 already
Yloop_pSrcResidueNotNull
        SUBS        y, y, #1
        ;// First 4 pixels of the row: residues are int16, pixels are uint8
        LDR         dst, [pDst]                 ;// dst = [dcba]
        LDMIA       pSrcResidue!, {tmp1, tmp2}  ;// tmp1=[DC] tmp2=[BA]
        PKHBT       tmp3, tmp1, tmp2, LSL #16   ;// DeltaVal1 = [C A]
        PKHTB       tmp4, tmp2, tmp1, ASR #16   ;// DeltaVal2 = [D B]
        UXTB16      tmp1, dst                   ;// tmp1 = [0c0a]
        UXTB16      tmp2, dst, ROR #8           ;// tmp2 = [0d0b]
        QADD16      tmp1, tmp1, tmp3            ;// Add and saturate to 16 bits
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1              ;// clamp to [0, 255]
        USAT16      tmp2, #8, tmp2              ;// armClip(0, 255, tmp2)
        ORR         tmp1, tmp1, tmp2, LSL #8    ;// repack: tmp1 = [dcba]
        STR         tmp1, [pDst], #4

        ;// Second 4 pixels of the row: same sequence, then step to next row
        LDR         dst, [pDst]
        LDMIA       pSrcResidue!, {tmp1, tmp2}
        PKHBT       tmp3, tmp1, tmp2, LSL #16
        PKHTB       tmp4, tmp2, tmp1, ASR #16
        UXTB16      tmp1, dst
        UXTB16      tmp2, dst, ROR #8
        QADD16      tmp1, tmp1, tmp3
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1
        USAT16      tmp2, #8, tmp2
        ORR         tmp1, tmp1, tmp2, LSL #8
        STR         tmp1, [pDst], dstStep

        BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd

        MOV         return, #OMX_Sts_NoErr

        M_END
    ENDIF ;// ARM1136JS

;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
        END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************