omxVCM4P10_PredictIntra_16x16_s.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Syntax: ARM armasm (RN/QN/DN register aliases, EQU, INCLUDE, END).
;// The table of multipliers and the function body are only assembled
;// when the CortexA8 variant is selected (see IF CortexA8 below).
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8


;//-------------------------------------------------------
;// This table implements a C-style switch statement in asm
;// by the method of two levels of indexing: predMode selects
;// a word of this table, and that word is loaded into pc.
;// Entry order: 0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_pIndexTable16x16
    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE


    IF CortexA8

;//-------------------------------------------------------
;// Multiplier rows used by the PLANE case:
;//   row 0: weights for the H and V gradient sums
;//   row 1: x = 0..7   (multiplied by b)
;//   row 2: x = 8..15  (multiplied by b)
;// NOTE(review): the second M_TABLE argument is interpreted by the
;// macro in armCOMM_s.h (not visible here) - confirm its meaning.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_MultiplierTable16x16,1
    DCW   7,  6,  5,  4,  3,  2,  1,  8
    DCW   0,  1,  2,  3,  4,  5,  6,  7
    DCW   8,  9, 10, 11, 12, 13, 14, 15

;//--------------------------------------------
;// Constants
;// (only BLK_SIZE is referenced in this file)
;//--------------------------------------------
BLK_SIZE        EQU 0x10
MUL_CONST0      EQU 0x01010101
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
y               RN 12       ;// row counter (HOR and PLANE loops)
pc              RN 15

return          RN 0        ;// shares r0 with pSrcLeft; written only on exit
pTable          RN 9        ;// dispatch table base (shares r9 with pMultTable)
count           RN 11       ;// DC: number of available neighbours (shares r11 with pTmp2)
pMultTable      RN 9
; ----------------------------------------------
; Neon registers
; ----------------------------------------------
qAbove          QN Q0.U8
qLeft           QN Q1.U8
qSum8           QN Q0.U16
dSum80          DN D0.U16
dSum81          DN D1.U16
dSum4           DN D0.U16
dSum2           DN D0.U32
dSum1           DN D0.U64
qOut            QN Q3.U8
dSumLeft        DN D6.U64
dSumAbove       DN D7.U64
dSum            DN D8.U64
dSum0           DN D8.U8[0]

qH              QN Q11.S32
qV              QN Q12.S32
qA              QN Q11.S16
qB              QN Q6.S16
qC              QN Q7.S16

qB0             QN Q5.S16
qB1             QN Q6.S16
dA1             DN D23.S16

dH0             DN D22.S32
dH1             DN D23.S32
dV0             DN D24.S32
dV1             DN D25.S32

qHV             QN Q11.S64
qHV0            QN Q11.S32
qHV1            QN Q12.S64

dHV00           DN D22.S32
dHV01           DN D23.S32

dHV0            DN D22.S16[0]
dHV1            DN D23.S16[0]
dHV10           DN D24.S64
dHV11           DN D25.S64

qSum0           QN Q0.S16
qSum1           QN Q1.S16

dOut0           DN D6.U8
dOut1           DN D7.U8

dLeft0          DN D2.U8
dLeft1          DN D3.U8
qConst          QN Q13.S16

dAbove0         DN D0.U8
dAbove1         DN D1.U8

dRevLeft64      DN D12.U64
dRevLeft        DN D12.U8
dRevAbove64     DN D5.U64
dRevAbove       DN D5.U8
qLeftDiff       QN Q8.S16
dLeftDiff1      DN D17.S16
dLeftDiff64     DN D17.S64
qDiffLeft       QN Q8.S16
qDiffAbove      QN Q4.S16
dAboveDiff1     DN D9.S16
dAboveDiff64    DN D9.S64
qAboveDiff      QN Q4.S16

dAboveLeft      DN D4.U8

dDiffLeft0      DN D16.S16
dDiffLeft1      DN D17.S16
dDiffAbove0     DN D8.S16
dDiffAbove1     DN D9.S16

qLeft15minus0   QN Q7.S16
dLeft15minus0   DN D14.S16
qAbove15minus0  QN Q3.S16
dAbove15minus0  DN D6.S16

qMultiplier     QN Q10.S16
qMultiplier0    QN Q10.S16
qMultiplier1    QN Q12.S16
dMultiplier0    DN D20.S16
dMultiplier1    DN D21.S16

dBPlusCMult7    DN D1.S64
dBPlusCMult7S16 DN D1.S16

qTmp            QN Q0.U8

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

pTmp            RN 8
step            RN 10
pTmp2           RN 11

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Writes a 16x16 block of predicted bytes to pDst, one 16-byte row
;// every dstStep bytes, using the mode selected by predMode.
;//
;// In:   r0 = pSrcLeft       left neighbour column, one byte every leftStep bytes
;//       r1 = pSrcAbove      16 bytes of the row above
;//       r2 = pSrcAboveLeft  single above-left byte (read by PLANE only)
;//       r3 = pDst           destination
;//       stack: leftStep, dstStep, predMode, availability (4 bytes each)
;// Out:  r0 = OMX_Sts_NoErr on every path
;//
;// predMode is used unchecked as an index into the four-entry table
;// armVCM4P10_pIndexTable16x16. availability is tested only in the DC
;// case; VERT, HOR and PLANE read their neighbours unconditionally.
;// No argument validation is performed in this file.
;//
;// NOTE(review): "r11, d15" on M_START presumably names the highest ARM and
;// NEON registers the macro must preserve - confirm against armCOMM_s.h.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15

        ;// Define stack arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg

        MOV      y, #BLK_SIZE                        ;// Outer Loop Count
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

;//--------------------------------------------
;// VERT: every output row is a copy of pSrcAbove[0..15].
;// pDst walks the even rows and pTmp the odd rows, both
;// advancing by 2*dstStep.
;//--------------------------------------------
OMX_VC_16X16_VERT
        VLD1    qAbove,  [pSrcAbove]
        ADD     pTmp, pDst, dstStep                  ;// pTmp -> row 1
        ADD     step, dstStep, dstStep               ;// step = 2*dstStep
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst]
        VST1    qAbove, [pTmp]
        MOV     return, #OMX_Sts_NoErr               ;// returnNoError
        M_EXIT

;//--------------------------------------------
;// HOR: output row r is the byte at pSrcLeft[r*leftStep]
;// replicated 16 times. Even rows use pSrcLeft/pDst, odd
;// rows use pTmp/pTmp2; leftStep and dstStep are doubled
;// in place so each pointer skips the other's rows.
;//--------------------------------------------
OMX_VC_16X16_HOR
        ADD     pTmp, pSrcLeft, leftStep
        ADD     leftStep, leftStep, leftStep
        ADD     pTmp2, pDst, dstStep
        ADD     dstStep, dstStep, dstStep
LoopHor
        VLD1     {qLeft[]}, [pSrcLeft], leftStep     ;// load one byte, replicate to all 16 lanes
        VLD1     {qTmp[]}, [pTmp], leftStep
        SUBS     y, y, #8                            ;// 8 rows are written per iteration
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep
        VLD1     {qTmp[]}, [pTmp], leftStep
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep

        BNE      LoopHor                             ;// Two iterations of 8 rows (y: 16 -> 8 -> 0)
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: fill the block with a single value:
;//   left and upper available : (sumLeft + sumAbove + 16) >> 5
;//   only one available       : (sum + 8) >> 4
;//   neither available        : 128
;// count records how many of the two neighbours were summed.
;//--------------------------------------------
OMX_VC_16X16_DC
        MOV      count, #0                           ;// count = 0
        TST      availability, #OMX_VC_LEFT
        BEQ      UpperOrNoneAvailable                ;// Skip the left sum if left is not available

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Gather the 16 left-column bytes into qLeft, one lane per row
        VLD1    {qLeft[0]}, [pSrcLeft],step
        VLD1    {qLeft[1]}, [pTmp],step
        VLD1    {qLeft[2]}, [pSrcLeft],step
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step
        VLD1    {qLeft[5]}, [pTmp],step
        VLD1    {qLeft[6]}, [pSrcLeft],step
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step
        VLD1    {qLeft[9]}, [pTmp],step
        VLD1    {qLeft[10]},[pSrcLeft],step
        VLD1    {qLeft[11]},[pTmp],step
        VLD1    {qLeft[12]},[pSrcLeft],step
        VLD1    {qLeft[13]},[pTmp],step
        VLD1    {qLeft[14]},[pSrcLeft],step
        VLD1    {qLeft[15]},[pTmp]

        ;// Horizontal add: 16 x U8 -> 8 x U16 -> 4 x U16 -> 2 x U32 -> 1 x U64
        VPADDL   qSum8, qLeft
        ADD     count, count, #1
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumLeft, dSum2
        VRSHR    dSum, dSumLeft, #4                  ;// left-only result: (sumLeft + 8) >> 4

UpperOrNoneAvailable
        TST      availability,  #OMX_VC_UPPER        ;// if(availability & #OMX_VC_UPPER)
        BEQ      BothOrNoneAvailable                 ;// Skip the upper sum if upper is not available
        VLD1     qAbove, [pSrcAbove]
        ADD      count, count, #1                    ;// if upper inc count by 1
        VPADDL   qSum8, qAbove
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumAbove, dSum2
        VRSHR    dSum, dSumAbove, #4                 ;// upper-only result: (sumAbove + 8) >> 4

BothOrNoneAvailable
        CMP      count, #2                           ;// check if both available
        BNE      NoneAvailable
        VADD     dSum, dSumAbove, dSumLeft
        VRSHR    dSum, dSum, #5                      ;// (sumLeft + sumAbove + 16) >> 5


NoneAvailable
        VDUP     qOut, dSum0                         ;// replicate the low byte of dSum
        CMP      count, #0                           ;// check if none available
        ADD      pTmp, pDst, dstStep
        ADD      step, dstStep, dstStep
        BNE      LoopDC
        VMOV     qOut, #128                          ;// neither neighbour available: use 128
LoopDC
        ;// Straight-line store of all 16 rows (label only; there is no back-branch)
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// PLANE:
;//   H = sum over k=1..8 of k * (above[7+k] - above[7-k])
;//   V = sum over k=1..8 of k * (left[7+k]  - left[7-k])
;//       where above[-1] and left[-1] are both pSrcAboveLeft[0]
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;//   a = 16 * (above[15] + left[15])
;//   out[y][x] = saturate_u8((a + b*(x-7) + c*(y-7) + 16) >> 5)
;//--------------------------------------------
OMX_VC_16X16_PLANE
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 15
        VLD1    dAboveLeft[0],[pSrcAboveLeft]               ;// only lane 0 of dAboveLeft is loaded
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        VLD1    {qLeft[0]},  [pSrcLeft],step
        VLD1    {qLeft[1]},  [pTmp],step
        VLD1    {qLeft[2]},  [pSrcLeft],step
        VLD1    {qLeft[3]},  [pTmp],step
        VLD1    {qLeft[4]},  [pSrcLeft],step
        VLD1    {qLeft[5]},  [pTmp],step
        VLD1    {qLeft[6]},  [pSrcLeft],step
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step
        VLD1    {qLeft[9]},  [pTmp],step
        VLD1    {qLeft[10]}, [pSrcLeft],step
        VLD1    {qLeft[11]}, [pTmp],step
        VLD1    {qLeft[12]}, [pSrcLeft],step
        VLD1    {qLeft[13]}, [pTmp],step
        VLD1    {qLeft[14]}, [pSrcLeft],step
        VLD1    {qLeft[15]}, [pTmp]

        VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8]
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
        VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X]
        VSUBL   qAboveDiff, dRevAbove, dAbove0              ;// lane i = pSrcAbove[14-i] - pSrcAbove[i], i = 0..6

        ;// Drop the unused lane 7 and append qAbove15minus0[0] as the last term
        VSHL    dAboveDiff64, dAboveDiff64, #16
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1

        VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8]
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
        VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X]
        VSUBL   qLeftDiff,dRevLeft, dLeft0                  ;// lane i = pSrcLeft[14-i] - pSrcLeft[i], i = 0..6

        ;// Multiplier = [7|6|5|4|3|2|1|8]
        VLD1    qMultiplier, [pMultTable]!

        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1

        ;// Weighted differences, four 32-bit partial sums each for H and V
        VMULL   qH,dDiffAbove0, dMultiplier0
        VMULL   qV,dDiffLeft0, dMultiplier0
        VMLAL   qH,dDiffAbove1, dMultiplier1
        VMLAL   qV,dDiffLeft1, dMultiplier1

        ;// Reduce: low 64 bits of qHV = H, high 64 bits = V
        VPADD   dHV00,dH1,dH0
        VPADD   dHV01,dV1,dV0
        VPADDL  qHV, qHV0
        VSHL    qHV1,qHV,#2                                 ;// 4*H, 4*V
        VADD    qHV,qHV,qHV1                                ;// 5*H, 5*V

        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
        VRSHR   qHV,qHV,#6

        ;// HV1 = [c*7|b*7]
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV

        ;// Multiplier0 = [0|1|2|...|7]
        VLD1    qMultiplier0, [pMultTable]!
        VDUP    qB, dHV0                                    ;// all lanes = low 16 bits of b
        VDUP    qC, dHV1                                    ;// all lanes = low 16 bits of c

        ;// a = 16*(pSrcAbove[15] + pSrcLeft[15]), replicated to all lanes
        VADDL   qA,dAbove1,dLeft1
        VSHL    qA,qA, #4
        VDUP    qA,dA1[3]
        VADD    dBPlusCMult7, dHV10, dHV11                  ;// 7*b + 7*c

        ;// Multiplier1 = [8|9|10|...|15]
        VLD1    qMultiplier1, [pMultTable]
        ;// Const = a - 7*(b+c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst

        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
        VMUL    qB0,qB,qMultiplier0

        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]
        VMUL    qB1,qB,qMultiplier1

        ;// Row 0 accumulators: b*x + Const for x = 0..7 and x = 8..15
        VADD    qSum0, qB0, qConst
        VADD    qSum1, qB1, qConst

        ;// Loops for 16 times
LoopPlane
        ;// (b*x + c*y + C)>>5, rounded and saturated to U8
        VQRSHRUN dOut0, qSum0,#5
        VQRSHRUN dOut1, qSum1,#5
        SUBS     y, y, #1
        VST1     qOut,[pDst],dstStep
        VADD     qSum0,qSum0,qC                             ;// next row: add c
        VADD     qSum1,qSum1,qC
        BNE      LoopPlane

        MOV      return, #OMX_Sts_NoErr

        M_END

        ENDIF ;// CortexA8

        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------