1;// 2;// Copyright (C) 2007-2008 ARM Limited 3;// 4;// Licensed under the Apache License, Version 2.0 (the "License"); 5;// you may not use this file except in compliance with the License. 6;// You may obtain a copy of the License at 7;// 8;// http://www.apache.org/licenses/LICENSE-2.0 9;// 10;// Unless required by applicable law or agreed to in writing, software 11;// distributed under the License is distributed on an "AS IS" BASIS, 12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13;// See the License for the specific language governing permissions and 14;// limitations under the License. 15;// 16;// 17;// 18;// File Name: omxVCM4P2_MCReconBlock_s.s 19;// OpenMAX DL: v1.0.2 20;// Revision: 9641 21;// Date: Thursday, February 7, 2008 22;// 23;// 24;// 25;// 26;// Description: 27;// 28;// 29 30;// Include standard headers 31 INCLUDE omxtypes_s.h 32 INCLUDE armCOMM_s.h 33 34;// Import symbols required from other files 35 36 M_VARIANTS ARM1136JS 37 38;// *************************************************************************** 39;// ARM1136JS implementation 40;// *************************************************************************** 41 IF ARM1136JS 42 43;// *************************************************************************** 44;// MACRO DEFINITIONS 45;// *************************************************************************** 46 ;// Description: 47 ;// 48 ;// dest[j] = (x[j] + y[j] + round) >> 1, j=0..3 49 ;// 50 ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to 51 ;// each sum before dividing by two, if round is 1 52 ;// 53 ;// Syntax: 54 ;// M_UHADD8R $dest, $x, $y, $round, $mask 55 ;// 56 ;// Inputs: 57 ;// $x four packed bytes, x[3] : x[2] : x[1] : x[0] 58 ;// $y four packed bytes, y[3] : y[2] : y[1] : y[0] 59 ;// $round 0 if no rounding to be added, 1 if rounding to be done 60 ;// $mask some register set to 0x80808080 61 ;// 62 ;// Outputs: 63 ;// $dest four packed bytes, z[3] : z[2] : z[1] 
        ;//     $dest     four packed bytes, z[3] : z[2] : z[1] : z[0]

        MACRO
        M_UHADD8R   $dest, $x, $y, $round, $mask
        IF $round = 1
        ;// Rounded halving add, computed per byte as
        ;//     z = ((y - ~x) >> 1) ^ 0x80
        ;// The borrow kept by UHSUB8 supplies the "+1" that UHADD8 would
        ;// truncate away; the EOR with $mask (0x80808080) maps each byte
        ;// back into the 0..255 range.
        IF $dest /= $y
        MVN         $dest, $x
        UHSUB8      $dest, $y, $dest
        EOR         $dest, $dest, $mask
        ELSE
        ;// $dest aliases $y: complement $y instead, so that no input is
        ;// overwritten before it has been consumed
        MVN         $dest, $y
        UHSUB8      $dest, $x, $dest
        EOR         $dest, $dest, $mask
        ENDIF
        ELSE
        ;// No rounding requested: plain unsigned halving add
        UHADD8      $dest, $x, $y
        ENDIF
        MEND
;// ***************************************************************************
        ;// Description:
        ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
        ;//
        ;// Syntax:
        ;// M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        ;//
        ;// Inputs:
        ;// $pSrc     4 byte aligned source pointer to an address just less
        ;//           than or equal to the data location
        ;// $srcStep  The stride on source
        ;// $scratch  A scratch register, used internally for temp
        ;//           calculations
        ;// $offset   Difference of source data location to the source
        ;//           pointer. Use when $offset != 0 (unaligned load)
        ;//
        ;// Outputs:
        ;// $pSrc     In case the macro accepts stride, it increments the
        ;//           pSrc by that value, else unchanged
        ;// $out0     four packed bytes, z[3] : z[2] : z[1] : z[0]
        ;// $out1     four packed bytes, z[7] : z[6] : z[5] : z[4]
        ;//
        ;// Note: {$out0, $out1, $scratch} should be registers with ascending
        ;// register numbering. In case offset is 0, $scratch is not modified.

        MACRO
        M_LOAD_X    $pSrc, $srcStep, $out0, $out1, $scratch, $offset
        IF $offset = 0
        LDM         $pSrc, {$out0, $out1}
        ADD         $pSrc, $pSrc, $srcStep
        ELSE
        ;// Unaligned: load 12 bytes and funnel-shift the wanted 8 bytes
        ;// into place (little-endian byte order assumed)
        LDM         $pSrc, {$out0, $out1, $scratch}
        ADD         $pSrc, $pSrc, $srcStep

        MOV         $out0, $out0, LSR #8 * $offset
        ORR         $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
        MOV         $out1, $out1, LSR #8 * $offset
        ORR         $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
        ENDIF
        MEND

;// ***************************************************************************
        ;// Description:
        ;// Loads three words for X interpolation, update pointer to next row.
        ;// For X interpolation, given a truncated-4byteAligned source
        ;// pointer, invariably three continuous words are required from
        ;// there to get the nine bytes from the source pointer for
        ;// filtering.
        ;//
        ;// Syntax:
        ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        ;//
        ;// Inputs:
        ;// $pSrc     4 byte aligned source pointer to an address just less
        ;//           than or equal to the data location
        ;//
        ;// $srcStep  The stride on source
        ;//
        ;// $offset   Difference of source data location to the source
        ;//           pointer. Use when $offset != 0 (unaligned load)
        ;//
        ;// Outputs:
        ;// $pSrc     Incremented by $srcStep
        ;//
        ;// $word0, $word1, $word2, $word3
        ;//           Three of these are outputs based on the $offset
        ;//           parameter. The outputs are specifically generated to be
        ;//           processed by the M_EXT_XINT macro. Following is the
        ;//           illustration to show how the nine bytes are spanned for
        ;//           different offsets from
        ;//           notTruncatedForAlignmentSourcePointer.
        ;//
        ;//            ------------------------------------------------------
        ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
        ;//           |------------------------------------------------------|
        ;//           |   0    |      0      | 0123  | 4567  | 8xxx  |       |
        ;//           |   1    |     -1      | x012  | 3456  | 78xx  |       |
        ;//           |   2    |     -2      | xx01  | 2345  | 678x  |       |
        ;//           |   3    |     -3      | xxx0  |       | 1234  | 5678  |
        ;//            ------------------------------------------------------
        ;//
        ;//           where the numbering (0-8) is to designate the 9 bytes
        ;//           from the start of a particular row. The illustration
        ;//           doesn't take into account the positioning of bytes
        ;//           within the word and the macro combination with
        ;//           M_EXT_XINT will work only in little endian environs
        ;//
        ;// Note: {$word0, $word1, $word2, $word3} should be registers with
        ;// ascending register numbering

        MACRO
        M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
        IF $offset /= 3
        LDM         $pSrc, {$word0, $word1, $word2}
        ELSE
        ;// offset 3: load into word0, word2, word3 so that M_EXT_XINT can
        ;// build word1 from word2/word3 without extra moves
        LDM         $pSrc, {$word0, $word2, $word3}
        ENDIF
        ADD         $pSrc, $pSrc, $srcStep
        MEND

;// ***************************************************************************
        ;// Description:
        ;// Extract four registers of four pixels for X interpolation
        ;//
        ;// Syntax:
        ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
        ;//
        ;// Inputs:
        ;// $offset   Difference of source data location to the source
        ;//           pointer. Use when $offset != 0 (unaligned load)
        ;//
        ;// $word0, $word1, $word2, $word3
        ;//           Three of these are inputs based on the $offset
        ;//           parameter. The inputs are specifically selected to be
        ;//           processed by the M_EXT_XINT macro.
        ;//
        ;//            ------------------------------------------------------
        ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
        ;//           |------------------------------------------------------|
        ;//           |   0    |      0      | 0123  | 4567  | 8xxx  | yyyy  |
        ;//           |   1    |     -1      | x012  | 3456  | 78xx  | yyyy  |
        ;//           |   2    |     -2      | xx01  | 2345  | 678x  | yyyy  |
        ;//           |   3    |     -3      | xxx0  | yyyy  | 1234  | 5678  |
        ;//            ------------------------------------------------------
        ;//
        ;// Outputs:
        ;// $word0, $word1, $word2, $word3
        ;//           Bytes from the original source pointer (not truncated
        ;//           for 4 byte alignment) as shown in the table.
        ;//            -------------------------------
        ;//           | word0 | word1 | word2 | word3 |
        ;//           |-------------------------------|
        ;//           | 0123  | 4567  | 1234  | 5678  |
        ;//            -------------------------------
        ;//           i.e. a whole row in {word0, word1} and its one-pixel
        ;//           right-shifted copy in {word2, word3}
        ;//
        ;// Note: {$word0, $word1, $word2, $word3} should be registers with
        ;// ascending register numbering

        MACRO
        M_EXT_XINT  $offset, $word0, $word1, $word2, $word3
        IF $offset = 0
        ;// $word0 and $word1 are ok
        ;// $word2, $word3 are just 8 shifted versions
        MOV         $word3, $word1, LSR #8
        ORR         $word3, $word3, $word2, LSL #24
        MOV         $word2, $word0, LSR #8
        ORR         $word2, $word2, $word1, LSL #24
        ELIF $offset = 3
        ;// $word2 and $word3 are ok (taken care of while loading itself)
        ;// set $word0 & $word1
        MOV         $word0, $word0, LSR #24
        ORR         $word0, $word0, $word2, LSL #8
        MOV         $word1, $word2, LSR #24
        ORR         $word1, $word1, $word3, LSL #8
        ELSE
        ;// General case: funnel-shift the row into {word0, word1} first ...
        MOV         $word0, $word0, LSR #8 * $offset
        ORR         $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
        MOV         $word1, $word1, LSR #8 * $offset
        ORR         $word1, $word1, $word2, LSL #(32 - 8 * ($offset))

        ;// ... then derive the byte-shifted copy in {word2, word3}
        MOV         $word3, $word1, LSR #8
        ORR         $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
        MOV         $word2, $word0, LSR #8
        ORR         $word2, $word2, $word1, LSL #24
        ENDIF
        MEND

;// ***************************************************************************
        ;// Description:
        ;// Computes half-sum and xor of two inputs and puts them in the
        ;// input registers in that order
        ;//
        ;// Syntax:
        ;// M_HSUM_XOR $v0, $v1, $tmp
        ;//
        ;// Inputs:
        ;// $v0       a, first input
        ;// $v1       b, second input
        ;// $tmp      scratch register
        ;//
        ;// Outputs:
        ;// $v0       (a + b)/2
        ;// $v1       a ^ b

        MACRO
        M_HSUM_XOR  $v0, $v1, $tmp
        UHADD8      $tmp, $v0, $v1          ;// s0 = (a + b) >> 1
        EOR         $v1, $v0, $v1           ;// l0 = a ^ b
        MOV         $v0, $tmp               ;// $v0 = s0
        MEND
;// ***************************************************************************
        ;// Description:
        ;//
        ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict
        ;// type in the mcReconBlock module. Very specific to the
        ;// implementation of M_MCRECONBLOCK_HalfPixelXY done here. Uses
        ;// "tmp" as scratch register and "yMask" for mask variable
        ;// "0x1010101x" set in it. In yMask the 4 lsbs are not significant
        ;// and are used by the callee for the row counter (y)
        ;//
        ;// Some points to note are:
        ;// 1. Input is pair of pair-averages and Xors
        ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in
        ;//    another running average
        ;// 3. Output is in the first argument
        ;//
        ;// Syntax:
        ;// M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal
        ;//
        ;// Inputs:
        ;// $sum0     (a + b) >> 1, where a and b are 1st and 2nd inputs to
        ;//           be averaged
        ;// $lsb0     (a ^ b)
        ;// $sum1     (c + d) >> 1. Not modified
        ;// $lsb1     (c ^ d)      Not modified
        ;// $rndVal   Assembler Variable. 0 for rounding, 1 for no rounding
        ;//
        ;// Outputs:
        ;// $sum0     (a + b + c + d + 1) / 4 : If no rounding
        ;//           (a + b + c + d + 2) / 4 : If rounding

        MACRO
        M_AVG4      $sum0, $lsb0, $sum1, $lsb1, $rndVal
        LCLS OP1
        LCLS OP2
        ;// The carry-correction term is built from the pairwise xors; the
        ;// AND/ORR operator pair is swapped between the rounding and
        ;// non-rounding variants to bias the truncated halving adds in the
        ;// required direction.
        IF $rndVal = 0                      ;// rounding case
OP1     SETS "AND"
OP2     SETS "ORR"
        ELSE                                ;// Not rounding case
OP1     SETS "ORR"
OP2     SETS "AND"
        ENDIF

        LCLS lsb2
        LCLS sum2
        LCLS dest

lsb2    SETS "tmp"
sum2    SETS "$lsb0"
dest    SETS "$sum0"

        $OP1        $lsb0, $lsb0, $lsb1             ;// e0 = e0 & e1
        EOR         $lsb2, $sum0, $sum1             ;// e2 = s0 ^ s1
        $OP2        $lsb2, $lsb2, $lsb0             ;// e2 = e2 | e0
        AND         $lsb2, $lsb2, yMask, LSR # 4    ;// e2 &= 0x01010101 (mask part of yMask)
        UHADD8      $sum2, $sum0, $sum1             ;// s2 = (s0 + s1)/2
        UADD8       $dest, $sum2, $lsb2             ;// dest = s2 + e2
        MEND
;// ***************************************************************************
;// Motion compensation handler macros
;// ***************************************************************************
        ;// Description:
        ;// Implement motion compensation routines using the named registers
        ;// in the callee function. Each of the following 4 implements one of
        ;// the 4 predict types. Each handles 8 cases, i.e. all combinations
        ;// of the 4 source alignment offsets and 2 rounding flags.
        ;//
        ;// Syntax:
        ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
        ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
        ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
        ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
        ;//
        ;// Inputs:
        ;// $rndVal   Assembler Variable. 0 for rounding, 1 for no rounding
        ;// $offset   $pSrc MOD 4 value. Offset from 4 byte aligned location.
        ;//
        ;// Outputs:
        ;// Outputs come in the named registers of the callee functions.
        ;// The macro loads the data from the source pointer, processes it
        ;// and stores it at the destination pointer. Does the whole
        ;// prediction cycle of the Motion Compensation routine for a
        ;// particular predictType. After this only residue addition to the
        ;// predicted values remains.

        MACRO
        M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
        ;// Algorithmic Description:
        ;// This handles motion compensation for IntegerPixel predictType.
        ;// Both rounding cases are handled by the same code base (a plain
        ;// copy is unaffected by rounding), hence the two case labels on the
        ;// same code. Two lines are done per loop to reduce stalls, and the
        ;// loop is software pipelined (loads for the next iteration issued
        ;// before the branch).
        ;//
        ;// M_LOAD_X loads a whole row into two registers which are stored.
CaseIntegerPixelRnd0Offset$offset
CaseIntegerPixelRnd1Offset$offset
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
YloopIntegerPixelOffset$offset
        SUBS        y, y, #2
        STRD        tmp1, tmp2, [pDst], dstStep
        STRD        tmp3, tmp4, [pDst], dstStep
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp3, $offset
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        BGT         YloopIntegerPixelOffset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
        ;// Algorithmic Description:
        ;// This handles motion compensation for HalfPixelX predictType. The
        ;// two rounding cases are handled by different code bases spanned by
        ;// different macro calls. The loop is software pipelined to reduce
        ;// stalls.
        ;//
        ;// Filtering involves averaging a pixel with the next horizontal
        ;// pixel. The M_LOAD_XINT + M_EXT_XINT combination generates 4
        ;// registers: 2 holding a full row (4 pixels per register) and 2
        ;// holding the one-pixel horizontally shifted copy of that row.
        ;// These packed registers are then averaged 4 lanes at a time by
        ;// M_UHADD8R, which also takes care of the rounding as required.
CaseHalfPixelXRnd$rndVal.Offset$offset
        IF $rndVal = 0
        ;// Rounding needs the 0x80808080 correction mask in M_UHADD8R
        LDR         mask, =0x80808080
        ENDIF

        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
YloopHalfPixelXRnd$rndVal.Offset$offset
        SUBS        y, y, #1
        M_EXT_XINT  $offset, tmp1, tmp2, tmp3, tmp4
        M_UHADD8R   tmp5, tmp1, tmp3, (1-$rndVal), mask
        M_UHADD8R   tmp6, tmp2, tmp4, (1-$rndVal), mask
        STRD        tmp5, tmp6, [pDst], dstStep
        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
        BGT         YloopHalfPixelXRnd$rndVal.Offset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
        ;// Algorithmic Description:
        ;// This handles motion compensation for HalfPixelY predictType. The
        ;// two rounding cases are handled by different code bases spanned by
        ;// different macro calls. Pre-loading is used to avoid reloading the
        ;// same row: each loaded row is averaged with both its predecessor
        ;// and its successor.
        ;//
        ;// Filtering involves averaging a pixel with the next vertical
        ;// pixel. M_LOAD_X generates 2 registers with all pixels of a row
        ;// (4 pixels per register); these packed registers are averaged 4
        ;// lanes at a time by M_UHADD8R, which also handles the rounding.
CaseHalfPixelYRnd$rndVal.Offset$offset
        IF $rndVal = 0
        ;// Rounding needs the 0x80808080 correction mask in M_UHADD8R
        LDR         mask, =0x80808080
        ENDIF

        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
YloopHalfPixelYRnd$rndVal.Offset$offset
        SUBS        y, y, #2
        ;// Processing one line
        M_LOAD_X    pSrc, srcStep, tmp3, tmp4, tmp5, $offset
        M_UHADD8R   tmp1, tmp1, tmp3, (1-$rndVal), mask
        M_UHADD8R   tmp2, tmp2, tmp4, (1-$rndVal), mask
        STRD        tmp1, tmp2, [pDst], dstStep
        ;// Processing another line
        M_LOAD_X    pSrc, srcStep, tmp1, tmp2, tmp5, $offset
        M_UHADD8R   tmp3, tmp3, tmp1, (1-$rndVal), mask
        M_UHADD8R   tmp4, tmp4, tmp2, (1-$rndVal), mask
        STRD        tmp3, tmp4, [pDst], dstStep

        BGT         YloopHalfPixelYRnd$rndVal.Offset$offset

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
        MACRO
        M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
        ;// Algorithmic Description:
        ;// This handles motion compensation for HalfPixelXY predictType. The
        ;// two rounding cases are handled by different code bases spanned by
        ;// different macro calls. Pre-loading is used to avoid reloading the
        ;// same data.
        ;//
        ;// Filtering involves averaging a pixel with the next vertical,
        ;// horizontal and right-down diagonal pixels. Just as in the
        ;// HalfPixelX case, the M_LOAD_XINT + M_EXT_XINT combination
        ;// generates 4 registers holding a row and its 1-pixel right-shifted
        ;// version (4 pixels per register). Another call of that
        ;// macro-combination gets the next row. M_HSUM_XOR then produces the
        ;// mutual half-sum and xor of a row with its shifted version, which
        ;// feed M_AVG4 to compute the 4-element average with rounding. Note
        ;// that the half-sum/xor values are preserved for the next row, so
        ;// the next M_AVG4 call reuses them and saves recomputation.
        ;// Due to lack of registers, the row counter and the masking value
        ;// required by M_AVG4 are packed into the single register yMask: the
        ;// last nibble holds the row counter and the rest holds the mask
        ;// left-shifted by 4.
CaseHalfPixelXYRnd$rndVal.Offset$offset
        ;// mask = 0x01010101 in bits 31:4, row counter = 8 in bits 3:0
        LDR         yMask, =((0x01010101 << 4) + 8)

        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp           ;// s0, l0
        M_HSUM_XOR  t01, t11, tmp           ;// s0', l0'

YloopHalfPixelXYRnd$rndVal.Offset$offset
        ;// Processing one line
        ;// t00, t01, t10, t11 required from previous loop
        M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
        SUB         yMask, yMask, #2        ;// decrement row counter nibble by 2
        M_EXT_XINT  $offset, t20, t21, t30, t31
        M_HSUM_XOR  t20, t30, tmp           ;// s1, l1
        M_HSUM_XOR  t21, t31, tmp           ;// s1', l1'
        M_AVG4      t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
        M_AVG4      t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
        STRD        t00, t01, [pDst], dstStep   ;// store the average

        ;// Processing another line
        ;// t20, t21, t30, t31 required from above
        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
        TST         yMask, #7               ;// test counter nibble; sets flags for BGT
        M_EXT_XINT  $offset, t00, t01, t10, t11
        M_HSUM_XOR  t00, t10, tmp
        M_HSUM_XOR  t01, t11, tmp
        M_AVG4      t20, t30, t00, t10, $rndVal
        M_AVG4      t21, t31, t01, t11, $rndVal
        STRD        t20, t21, [pDst], dstStep

        BGT         YloopHalfPixelXYRnd$rndVal.Offset$offset

        ;// The last populated case (offset 3, no rounding) falls straight
        ;// through to SwitchPredictTypeEnd; every other case branches there.
        IF $offset/=3 :LOR: $rndVal/=1
        B           SwitchPredictTypeEnd
        ENDIF
        MEND
;// ***************************************************************************
;// Motion compensation handler macros end here
;//
;// ***************************************************************************
        ;// Description:
        ;// Populates all 4 kinds of offset "cases" for each predictType and
        ;// rndVal combination in the "switch" to the prediction processing
        ;// code segment
        ;//
        ;// Syntax:
        ;// M_CASE_OFFSET $rnd, $predictType
        ;//
        ;// Inputs:
        ;// $rnd          0 for rounding, 1 for no rounding
        ;// $predictType  The prediction mode
        ;//
        ;// Outputs:
        ;// Populated list of "M_CASE"s for the "M_SWITCH" macro

        MACRO
        M_CASE_OFFSET $rnd, $predictType
        M_CASE      Case$predictType.Rnd$rnd.Offset0
        M_CASE      Case$predictType.Rnd$rnd.Offset1
        M_CASE      Case$predictType.Rnd$rnd.Offset2
        M_CASE      Case$predictType.Rnd$rnd.Offset3
        MEND
;// ***************************************************************************
        ;// Description:
        ;// Populates both rounding "cases" for each predictType in the
        ;// "switch" to the prediction processing code segment
        ;//
        ;// Syntax:
        ;// M_CASE_MCRECONBLOCK $predictType
        ;//
        ;// Inputs:
        ;// $predictType  The prediction mode
        ;//
        ;// Outputs:
        ;// Populated list of "M_CASE_OFFSET" macros

        MACRO
        M_CASE_MCRECONBLOCK $predictType
        M_CASE_OFFSET 0, $predictType       ;// 0 for rounding
        M_CASE_OFFSET 1, $predictType       ;// 1 for no rounding
        MEND
;// ***************************************************************************
        ;// Description:
        ;// Populates all 8 kinds of rounding and offset combination handling
        ;// macros for the specified predictType. In case of "IntegerPixel"
        ;// predictType, rounding is not required so the same code segment
        ;// handles both cases.
        ;//
        ;// Syntax:
        ;// M_MCRECONBLOCK $predictType
        ;//
        ;// Inputs:
        ;// $predictType  The prediction mode
        ;//
        ;// Outputs:
        ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for the
        ;// specified predictType. Each
        ;//     M_MCRECONBLOCK_<predictType> $rnd, $offset
        ;// is a code segment (starting with a label indicating the
        ;// predictType, rounding and offset combination).
        ;// Four calls of this macro with the 4 prediction modes populate all
        ;// the 32 handlers.

        MACRO
        M_MCRECONBLOCK $predictType
        M_MCRECONBLOCK_$predictType 0, 0
        M_MCRECONBLOCK_$predictType 0, 1
        M_MCRECONBLOCK_$predictType 0, 2
        M_MCRECONBLOCK_$predictType 0, 3
        IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
        M_MCRECONBLOCK_$predictType 1, 0
        M_MCRECONBLOCK_$predictType 1, 1
        M_MCRECONBLOCK_$predictType 1, 2
        M_MCRECONBLOCK_$predictType 1, 3
        ENDIF
        MEND
;// ***************************************************************************
;// Input/Output Registers
pSrc            RN 0                        ;// source pixel buffer
srcStep         RN 1                        ;// source stride
arg_pSrcResidue RN 2                        ;// residue buffer argument (may be NULL)
pSrcResidue     RN 12                       ;// residue pointer (reloaded from stack)
pDst            RN 3                        ;// destination buffer
dstStep         RN 2                        ;// destination stride (stack argument)
predictType     RN 10
rndVal          RN 11
mask            RN 11                       ;// 0x80808080 for M_UHADD8R (reuses rndVal)

;// Local Scratch Registers
zero            RN 12
y               RN 14                       ;// row counter

tmp1            RN 4
tmp2            RN 5
tmp3            RN 6
tmp4            RN 7
tmp5            RN 8
tmp6            RN 9
tmp7            RN 10
tmp8            RN 11
tmp9            RN 12

t00             RN 4
t01             RN 5
t10             RN 6
t11             RN 7
t20             RN 8
t21             RN 9
t30             RN 10
t31             RN 11
tmp             RN 12

yMask           RN 14                       ;// mask (bits 31:4) + row counter (bits 3:0)

dst             RN 1
return          RN 0

        ;// Allocate memory on stack
        M_ALLOC4    Stk_pDst, 4
        M_ALLOC4    Stk_pSrcResidue, 4
        ;// Function header
        M_START     omxVCM4P2_MCReconBlock, r11
        ;// Define stack arguments
        M_ARG       Arg_dstStep, 4
        M_ARG       Arg_predictType, 4
        M_ARG       Arg_rndVal, 4
        ;// Save on stack
        M_STR       pDst, Stk_pDst
        M_STR       arg_pSrcResidue, Stk_pSrcResidue
        ;// Load arguments from the stack
        M_LDR       dstStep, Arg_dstStep
        M_LDR       predictType, Arg_predictType
        M_LDR       rndVal, Arg_rndVal

        MOV         y, #8                   ;// 8 rows in the 8x8 block

        ;// Pack the switch index: predictType in bits 4:3, rndVal in bit 2,
        ;// pSrc alignment offset in bits 1:0
        AND         tmp1, pSrc, #3
        ORR         predictType, tmp1, predictType, LSL #3
        ORR         predictType, predictType, rndVal, LSL #2
        ;// Truncate source pointer to align to 4 byte location
        BIC         pSrc, pSrc, #3

        ;// Implementation takes care of all combinations of different
        ;// predictTypes, rounding cases and source pointer offsets to
        ;// alignment of 4 bytes in different code bases, unless one of these
        ;// parameters wasn't making any difference to the implementation.
        ;// The M_CASE_MCRECONBLOCK macros below branch into 8 M_CASE macros
        ;// for all combinations of the 2 rounding cases and 4 offsets of the
        ;// pSrc pointer to the 4 byte alignment.
        M_SWITCH    predictType
        M_CASE_MCRECONBLOCK IntegerPixel
        M_CASE_MCRECONBLOCK HalfPixelX
        M_CASE_MCRECONBLOCK HalfPixelY
        M_CASE_MCRECONBLOCK HalfPixelXY
        M_ENDSWITCH

        ;// The M_MCRECONBLOCK macros populate the code bases by calling all
        ;// 8 particular macros (4 in case of IntegerPixel as rounding makes
        ;// no difference there) to generate the code for all cases of
        ;// rounding and offsets. LTORG is used to segment the code as code
        ;// size bloated beyond 4KB.
        M_MCRECONBLOCK IntegerPixel
        M_MCRECONBLOCK HalfPixelX
        LTORG
        M_MCRECONBLOCK HalfPixelY
        M_MCRECONBLOCK HalfPixelXY
SwitchPredictTypeEnd

        ;// Residue Addition
        ;// This is done in 2 lane SIMD though loads are further optimized
        ;// and 4 bytes are loaded in case of the destination buffer.
        ;// Algorithmic details are in the inlined comments.
        M_LDR       pSrcResidue, Stk_pSrcResidue
        CMP         pSrcResidue, #0         ;// NULL residue pointer: prediction only
        BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
        M_LDR       pDst, Stk_pDst
        MOV         y, #8
        SUB         dstStep, dstStep, #4    ;// account for the mid-row [pDst], #4 post-increment
Yloop_pSrcResidueNotNull
        SUBS        y, y, #1
        LDR         dst, [pDst]                     ;// dst = [dcba]
        LDMIA       pSrcResidue!, {tmp1, tmp2}      ;// tmp1=[DC] tmp2=[BA] (16-bit residues)
        PKHBT       tmp3, tmp1, tmp2, LSL #16       ;// DeltaVal1 = [C A]
        PKHTB       tmp4, tmp2, tmp1, ASR #16       ;// DeltaVal2 = [D B]
        UXTB16      tmp1, dst                       ;// tmp1 = [0c0a]
        UXTB16      tmp2, dst, ROR #8               ;// tmp2 = [0d0b]
        QADD16      tmp1, tmp1, tmp3                ;// Add and saturate to 16 bits
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1                  ;// armClip(0, 255, tmp1)
        USAT16      tmp2, #8, tmp2                  ;// armClip(0, 255, tmp2)
        ORR         tmp1, tmp1, tmp2, LSL #8        ;// tmp1 = [dcba]
        STR         tmp1, [pDst], #4

        ;// Second 4 pixels of the row: identical processing
        LDR         dst, [pDst]
        LDMIA       pSrcResidue!, {tmp1, tmp2}
        PKHBT       tmp3, tmp1, tmp2, LSL #16
        PKHTB       tmp4, tmp2, tmp1, ASR #16
        UXTB16      tmp1, dst
        UXTB16      tmp2, dst, ROR #8
        QADD16      tmp1, tmp1, tmp3
        QADD16      tmp2, tmp2, tmp4
        USAT16      tmp1, #8, tmp1
        USAT16      tmp2, #8, tmp2
        ORR         tmp1, tmp1, tmp2, LSL #8
        STR         tmp1, [pDst], dstStep

        BGT         Yloop_pSrcResidueNotNull
pSrcResidueConditionEnd

        MOV         return, #OMX_Sts_NoErr

        M_END
        ENDIF ;// ARM1136JS

;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
        END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************