;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Overview
;//   NEON implementation of 8x8 intra chroma prediction.  The function
;//   dispatches on predMode through a table of code addresses and then
;//   produces eight 8-byte rows at pDst, dstStep bytes apart.
;//
;//   Register arguments : r0 = pSrcLeft, r1 = pSrcAbove,
;//                        r2 = pSrcAboveLeft, r3 = pDst
;//   Stack arguments    : leftStep, dstStep, predMode, availability
;//                        (4 bytes each, in that order)
;//   Result             : r0 = OMX_Sts_NoErr on every path
;//
;//   Only the DC mode tests the availability flags (OMX_VC_LEFT and
;//   OMX_VC_UPPER); the other three modes read their edges unconditionally.
;//   predMode is used as a table index without a range check.
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_pIndexTable8x8

;// Processor variants this file is built for

        M_VARIANTS CortexA8

        AREA table, DATA
;//-------------------------------------------------------
;// Jump table: entry [predMode] holds the address of the
;// code label that handles that prediction mode.  It is
;// the assembly equivalent of a C switch statement.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD     OMX_VC_CHROMA_DC
        DCD     OMX_VC_CHROMA_HOR
        DCD     OMX_VC_CHROMA_VERT
        DCD     OMX_VC_CHROMA_PLANE

;//-------------------------------------------------------
;// Halfword weights for the plane mode.
;//   first 4 halfwords : gradient weights, read on their own
;//   last  8 halfwords : per-column / per-row offsets -3..4,
;//                       read together as one Q register
;//-------------------------------------------------------

        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW      3, 2, 1, 4
        DCW     -3,-2,-1, 0
        DCW      1, 2, 3, 4



        IF CortexA8

;//--------------------------------------------
;// Scratch registers
;//--------------------------------------------

pc                  RN 15
return              RN 0        ;// shares r0 with pSrcLeft
pTable              RN 8

;//--------------------------------------------
;// Arguments
;//--------------------------------------------
pSrcLeft            RN 0        ;// input pointer
pSrcAbove           RN 1        ;// input pointer
pSrcAboveLeft       RN 2        ;// input pointer
pDst                RN 3        ;// output pointer
leftStep            RN 4        ;// loaded from the stack
dstStep             RN 5        ;// loaded from the stack
predMode            RN 6        ;// loaded from the stack
availability        RN 7        ;// loaded from the stack
pMultiplierTable    RN 2        ;// shares r2 with pSrcAboveLeft

pTmp                RN 9        ;// second row pointer (odd rows)
step                RN 10       ;// twice the row stride

;//---------------------
;// NEON register names
;//---------------------

;// Horizontal mode: one D register per output row

dLeftVal0           DN D0.8
dLeftVal1           DN D1.8
dLeftVal2           DN D2.8
dLeftVal3           DN D3.8
dLeftVal4           DN D4.8
dLeftVal5           DN D5.8
dLeftVal6           DN D6.8
dLeftVal7           DN D7.8

;// Vertical mode

dAboveVal           DN D0.U8

;// DC mode (several names deliberately view the same D register
;// with a different element type)

dLeftVal            DN D1.U8
dSumAboveValU16     DN D2.U16
dSumAboveValU32     DN D3.U32
dSumAboveValU8      DN D3.U8
dSumLeftValU16      DN D2.U16
dSumLeftValU32      DN D1.U32
dSumLeftValU8       DN D1.U8
dSumAboveLeft       DN D2.U32
dSumAboveLeftU8     DN D2.U8
dIndexRow0U8        DN D5.U8
dIndexRow0          DN D5.U64
dIndexRow4U8        DN D6.U8
dIndexRow4          DN D6.U64
dDstRow0            DN D0.U8
dDstRow4            DN D4.U8
dConst128U8         DN D0.U8

;// Plane mode

dRevAboveVal        DN D3.U8
dRevAboveValU64     DN D3.U64
dAboveLeftVal       DN D2.U8
qAbove7minus0       QN Q3.S16
qAboveDiff          QN Q2.S16
dIndex              DN D8.U8
dDiffAboveU8        DN D9.U8
dDiffAboveS16       DN D9.S16
dAboveDiff0U8       DN D4.U8
dAboveDiff0U64      DN D4.U64
dAbove7minus0U8     DN D6.U8
dMultiplier         DN D10.S16
dHorPred            DN D11.S16
dRevLeftVal         DN D3.U8
dRevLeftValU64      DN D3.U64
qLeft7minus0        QN Q7.S16
qLeftDiff           QN Q6.S16
dDiffLeftU8         DN D16.U8
dDiffLeftS16        DN D16.S16
dLeftDiff0U8        DN D12.U8
dLeftDiff0U64       DN D12.U64
dLeft7minus0U8      DN D14.U8
dVerPred            DN D3.S16
dHVValS16           DN D3.S16
dHVValS32           DN D3.S32
dHVTempS32          DN D2.S32
qA                  QN Q0.S16
qB                  QN Q2.S16
qC                  QN Q3.S16
qMultiplier         QN Q5.S16
dMultiplier0        DN D10.S16
dMultiplier1        DN D11.S16
qC0                 QN Q0.S16
qC1                 QN Q1.S16
qC2                 QN Q4.S16
qC3                 QN Q5.S16
qC4                 QN Q6.S16
qC5                 QN Q7.S16
qC6                 QN Q8.S16
qC7                 QN Q9.S16
qSum0               QN Q0.S16
qSum1               QN Q1.S16
qSum2               QN Q4.S16
qSum3               QN Q5.S16
qSum4               QN Q6.S16
qSum5               QN Q7.S16
qSum6               QN Q8.S16
qSum7               QN Q9.S16
dSum0               DN D0.U8
dSum1               DN D1.U8
dSum2               DN D2.U8
dSum3               DN D3.U8
dSum4               DN D4.U8
dSum5               DN D5.U8
dSum6               DN D6.U8
dSum7               DN D7.U8

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//-----------------------------------------------------------------------------------------------

        ;// Function entry (macro supplied by the included headers)
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

        ;// Names for the four stack-passed arguments
        M_ARG   LeftStep, 4
        M_ARG   DstStep, 4
        M_ARG   PredMode, 4
        M_ARG   Availability, 4

        LDR     pTable, =armVCM4P10_pIndexTable8x8      ;// base of the jump table

        ;// Bring the stack arguments into registers
        M_LDR   predMode, PredMode
        M_LDR   leftStep, LeftStep
        M_LDR   dstStep, DstStep
        M_LDR   availability, Availability


        LDR     pc, [pTable, predMode, LSL #2]          ;// jump to the handler for predMode

;//-----------------------------------------------------------------------------------------------
;// DC mode
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_DC

        TST     availability, #OMX_VC_LEFT
        BEQ     Chroma8x8DcNoLeft

        ;// Two pointers walk the left column: pSrcLeft takes the even
        ;// rows and pTmp the odd rows, each advancing by 2*leftStep.
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Gather the eight left-edge bytes into the lanes of one register
        VLD1    {dLeftVal[0]}, [pSrcLeft], step         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]}, [pTmp], step             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]}, [pSrcLeft], step         ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]}, [pTmp], step             ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]}, [pSrcLeft], step         ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]}, [pTmp], step             ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]}, [pSrcLeft], step         ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]}, [pTmp]                   ;// pSrcLeft[7*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     Chroma8x8DcLeftOnly

        ;// Both edges present: fetch the top edge as well
        VLD1    dAboveVal,[pSrcAbove]                   ;// pSrcAbove[0..7]

        ;// r0 is free from here on (left edge already read), so set the result
        MOV     return, #OMX_Sts_NoErr

        ;// Two pairwise-add steps leave one sum per group of four bytes
        VPADDL  dSumAboveValU16, dAboveVal              ;// above: [ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16        ;// above: [ 4+5+6+7 | 0+1+2+3 ]

        VPADDL  dSumLeftValU16, dLeftVal                ;// left:  [ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16          ;// left:  [ 4+5+6+7 | 0+1+2+3 ]

        VADD    dSumAboveLeft, dSumAboveValU32, dSumLeftValU32
        VRSHR   dSumAboveLeft, dSumAboveLeft, #3        ;// (above + left + 4) >> 3
        VRSHR   dSumAboveValU32, dSumAboveValU32, #2    ;// (above + 2) >> 2
        VRSHR   dSumLeftValU32, dSumLeftValU32, #2      ;// (left  + 2) >> 2

        ;// Build byte-select patterns, then pick one averaged byte per
        ;// 4-pixel half-row out of a two-register table.
        VMOV    dIndexRow0U8, #0x0c
        VMOV    dIndexRow4U8, #0x04
        VSHL    dIndexRow0, dIndexRow0, #32             ;// index0 = 0x0c0c0c0c00000000
        VSHR    dIndexRow4, dIndexRow4, #32             ;// index4 = 0x0000000004040404
        VADD    dIndexRow4U8, dIndexRow4U8, dIndexRow0U8 ;// index4 = 0x0c0c0c0c04040404
        ;// rows 0-3: columns 0-3 <- combined low sum, columns 4-7 <- above high sum
        VTBL    dDstRow0, {dSumAboveLeftU8,dSumAboveValU8}, dIndexRow0U8
        ;// rows 4-7: columns 0-3 <- left high sum, columns 4-7 <- combined high sum
        VTBL    dDstRow4, {dSumLeftValU8,dSumAboveLeftU8}, dIndexRow4U8

        ;// Store tail: rows 0-3 come from dDstRow0, rows 4-7 from dDstRow4
Chroma8x8StoreHalves
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0, [pDst], step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp], step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pDst], step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp], step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow4, [pDst], step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow4, [pTmp], step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow4, [pDst], step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow4, [pTmp]                        ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


        ;// DC, left edge only: each half of the block is filled with the
        ;// rounded average of the four left pixels beside it.
Chroma8x8DcLeftOnly

        MOV     return, #OMX_Sts_NoErr

        VPADDL  dSumLeftValU16, dLeftVal                ;// left: [ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16          ;// left: [ 4+5+6+7 | 0+1+2+3 ]
        VRSHR   dSumLeftValU32, dSumLeftValU32, #2      ;// (sum + 2) >> 2

        VDUP    dDstRow0, dSumLeftValU8[0]              ;// rows 0-3: average of left[0..3]
        VDUP    dDstRow4, dSumLeftValU8[4]              ;// rows 4-7: average of left[4..7]

        B       Chroma8x8StoreHalves


        ;// DC, left edge missing
Chroma8x8DcNoLeft

        TST     availability, #OMX_VC_UPPER
        BEQ     Chroma8x8DcNoEdges

        ;// Top edge only
        VLD1    dAboveVal,[pSrcAbove]                   ;// pSrcAbove[0..7]
        MOV     return, #OMX_Sts_NoErr

        VPADDL  dSumAboveValU16, dAboveVal              ;// above: [ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16        ;// above: [ 4+5+6+7 | 0+1+2+3 ]
        VRSHR   dSumAboveValU32, dSumAboveValU32, #2    ;// (sum + 2) >> 2
        VMOV    dIndexRow0U8, #0x04
        VSHL    dIndexRow0, dIndexRow0, #32             ;// index = 0x0404040400000000
        ;// columns 0-3 <- average of above[0..3], columns 4-7 <- average of above[4..7]
        VTBL    dDstRow0, {dSumAboveValU8}, dIndexRow0U8

        B       Chroma8x8StoreSameRow


        ;// DC, neither edge: fill the block with the constant 0x80
Chroma8x8DcNoEdges

        VMOV    dConst128U8, #0x80                      ;// same D register as dDstRow0
        MOV     return, #OMX_Sts_NoErr

        ;// Store tail: the single row in dDstRow0 is written eight times
Chroma8x8StoreSameRow

        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0, [pDst], step                  ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp], step                  ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pDst], step                  ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp], step                  ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pDst], step                  ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp], step                  ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pDst], step                  ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow0, [pTmp]                        ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT

;//-----------------------------------------------------------------------------------------------
;// Vertical mode: every output row is a copy of the top edge
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_VERT

        VLD1    dAboveVal,[pSrcAbove]                   ;// lands in the register stored by the tail below
        MOV     return, #OMX_Sts_NoErr

        B       Chroma8x8StoreSameRow

;//-----------------------------------------------------------------------------------------------
;// Horizontal mode: output row y is pSrcLeft[y*leftStep] repeated eight times
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_HOR

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load-and-replicate: one left pixel fills all lanes of one D register.
        ;// dLeftVal0..7 are D0..D7, the same registers the store tail writes out.
        VLD1    {dLeftVal0[]}, [pSrcLeft], step         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]}, [pTmp], step             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]}, [pSrcLeft], step         ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]}, [pTmp], step             ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal4[]}, [pSrcLeft], step         ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal5[]}, [pTmp], step             ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal6[]}, [pSrcLeft], step         ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal7[]}, [pTmp]                   ;// pSrcLeft[7*leftStep]

        B       Chroma8x8StoreAllRows

;//-----------------------------------------------------------------------------------------------
;// Plane mode
;//   out[y][x] = sat_u8( (a + b*(x-3) + c*(y-3) + 16) >> 5 )
;//   a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;//   H and V are weighted sums of differences along the top and left edges
;//-----------------------------------------------------------------------------------------------
OMX_VC_CHROMA_PLANE
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        VLD1    dAboveVal,[pSrcAbove]                   ;// pSrcAbove[0..7]
        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]        ;// corner pixel into lane 0

        VLD1    {dLeftVal[0]}, [pSrcLeft], step         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]}, [pTmp], step             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]}, [pSrcLeft], step         ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]}, [pTmp], step             ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]}, [pSrcLeft], step         ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]}, [pTmp], step             ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]}, [pSrcLeft], step         ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]}, [pTmp]                   ;// pSrcLeft[7*leftStep]


        ;// Top-edge differences (only the lanes named below are used later)
        VREV64  dRevAboveVal, dAboveVal                 ;// lane i = pSrcAbove[7-i]
        VSUBL   qAbove7minus0, dRevAboveVal, dAboveLeftVal ;// lane 0 = pSrcAbove[7] - pSrcAboveLeft[0]
        VSHR    dRevAboveValU64, dRevAboveValU64, #8    ;// lane i = pSrcAbove[6-i]
        VSUBL   qAboveDiff, dRevAboveVal, dAboveVal     ;// lane 0 = pSrcAbove[6] - pSrcAbove[0]
                                                        ;// lane 1 = pSrcAbove[5] - pSrcAbove[1]
                                                        ;// lane 2 = pSrcAbove[4] - pSrcAbove[2]

        ;// Left-edge differences, same pattern
        VREV64  dRevLeftVal, dLeftVal                   ;// lane i = left[7-i]
        VSUBL   qLeft7minus0, dRevLeftVal, dAboveLeftVal ;// lane 0 = left[7] - pSrcAboveLeft[0]
        VSHR    dRevLeftValU64, dRevLeftValU64, #8      ;// lane i = left[6-i]
        VSUBL   qLeftDiff, dRevLeftVal, dLeftVal        ;// lane 0 = left[6] - left[0]
                                                        ;// lane 1 = left[5] - left[1]
                                                        ;// lane 2 = left[4] - left[2]

        ;// r2 is reused: the corner pixel has already been read through it
        LDR     pMultiplierTable, =armVCM4P10_MultiplierTableChroma8x8
        ;// Shift up by one halfword, then extract across the register pair so
        ;// the three inner differences and the outer one sit in one D register
        VSHL    dAboveDiff0U64, dAboveDiff0U64, #16
        VEXT    dDiffAboveU8, dAboveDiff0U8, dAbove7minus0U8, #2 ;// above: [ 7-corner | 4-2 | 5-1 | 6-0 ]
        VLD1    dMultiplier,[pMultiplierTable]!         ;// weights [ 4 | 1 | 2 | 3 ], pointer moves on
        VSHL    dLeftDiff0U64, dLeftDiff0U64, #16
        VEXT    dDiffLeftU8, dLeftDiff0U8, dLeft7minus0U8, #2 ;// left:  [ 7-corner | 4-2 | 5-1 | 6-0 ]


        VMUL    dHorPred, dDiffAboveS16, dMultiplier    ;// weighted top-edge differences
        VMUL    dVerPred, dDiffLeftS16, dMultiplier     ;// weighted left-edge differences
        VPADD   dHVValS16, dHorPred, dVerPred           ;// partial sums: two for H, two for V


        VPADDL  dHVValS32, dHVValS16                    ;// [ V | H ], 32 bits each
        VSHL    dHVTempS32, dHVValS32, #4               ;// 16*H, 16*V
        VADD    dHVValS32, dHVValS32, dHVTempS32        ;// [ 17*V | 17*H ]
        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable] ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
        VRSHR   dHVValS32, dHVValS32, #5                ;// [ c | b ] = (17*x + 16) >> 5
        VADDL   qA, dAboveVal, dLeftVal                 ;// lane i = pSrcAbove[i] + left[i]
        VDUP    qA, qA[7]                               ;// keep only pSrcAbove[7] + left[7]
        VSHL    qA, qA, #4                              ;// a in every lane
        VDUP    qB, dHVValS16[0]                        ;// low halfword of b in every lane
        VDUP    qC, dHVValS16[2]                        ;// low halfword of c in every lane


        VMUL    qB, qB, qMultiplier                     ;// lane x = b*(x-3)
        VMUL    qC, qC, qMultiplier                     ;// lane y = c*(y-3)
        VADD    qB, qB, qA                              ;// lane x = a + b*(x-3)

        ;// Broadcast each row's vertical term into its own Q register
        VDUP    qC0, qC[0]
        VDUP    qC1, qC[1]
        VDUP    qC2, qC[2]
        VDUP    qC3, qC[3]
        VDUP    qC4, qC[4]
        VDUP    qC5, qC[5]
        VDUP    qC6, qC[6]
        VDUP    qC7, qC[7]

        ;// Row y = horizontal ramp + that row's vertical term
        VADD    qSum0, qB, qC0
        VADD    qSum1, qB, qC1
        VADD    qSum2, qB, qC2
        VADD    qSum3, qB, qC3
        VADD    qSum4, qB, qC4
        VADD    qSum5, qB, qC5
        VADD    qSum6, qB, qC6
        VADD    qSum7, qB, qC7

        ;// Rounding shift by 5 with narrowing and saturation to unsigned bytes
        VQRSHRUN dSum0, qSum0, #5                       ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1, qSum1, #5
        VQRSHRUN dSum2, qSum2, #5
        VQRSHRUN dSum3, qSum3, #5
        VQRSHRUN dSum4, qSum4, #5
        VQRSHRUN dSum5, qSum5, #5
        VQRSHRUN dSum6, qSum6, #5
        VQRSHRUN dSum7, qSum7, #5

        ;// Store tail shared by the horizontal and plane modes:
        ;// eight distinct rows held in D0..D7
Chroma8x8StoreAllRows
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dSum0, [pDst], step                     ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dSum1, [pTmp], step                     ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dSum2, [pDst], step                     ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dSum3, [pTmp], step                     ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dSum4, [pDst], step                     ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dSum5, [pTmp], step                     ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dSum6, [pDst], step                     ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dSum7, [pTmp]                           ;// pDst[7*dstStep+x] :0<= x <= 7

        MOV     return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------