;// omxVCM4P10_PredictIntraChroma_8x8_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Syntax: ARM armasm (RN/DN/QN register aliases, AREA/INCLUDE/EXPORT
;// directives). The M_* macros (M_VARIANTS, M_TABLE, M_START, M_ARG, M_LDR,
;// M_EXIT, M_END) are not defined in this file; they are presumably provided
;// by armCOMM_s.h.
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_pIndexTable8x8

;// Define the processor variants supported by this file

        M_VARIANTS CortexA8

        AREA table, DATA
;//-------------------------------------------------------
;// This table implements a C-style switch/case in asm by
;// the method of two levels of indexing: predMode selects
;// a word from the table, and that word is the address of
;// the code label loaded into pc.
;// NOTE(review): the entry order (DC, HOR, VERT, PLANE) must match the
;// numeric values of the prediction-mode enum - confirm against
;// omxtypes_s.h.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD     OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR
        DCD     OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE

        ;// First row: weights used for the plane-mode H/V gradients.
        ;// Rows two and three: the eight per-column / per-row offsets
        ;// (-3..4) used to expand b and c across the block.
        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW      3, 2, 1,4
        DCW     -3,-2,-1,0
        DCW      1, 2, 3,4



        IF CortexA8

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------

pc              RN 15
return          RN 0
pTable          RN 8

;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
pMultiplierTable    RN 2    ;// shares r2 with pSrcAboveLeft (loaded after its last use)

pTmp            RN 9
step            RN 10

;//---------------------
;// Neon Registers
;//---------------------

;// OMX_VC_CHROMA_HOR

dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal4       DN  D4.8
dLeftVal5       DN  D5.8
dLeftVal6       DN  D6.8
dLeftVal7       DN  D7.8

;// OMX_VC_CHROMA_VERT

dAboveVal       DN  D0.U8

;// OMX_VC_CHROMA_DC

dLeftVal        DN  D1.U8
dSumAboveValU16 DN  D2.U16
dSumAboveValU32 DN  D3.U32
dSumAboveValU8  DN  D3.U8
dSumLeftValU16  DN  D2.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU8   DN  D1.U8
dSumAboveLeft   DN  D2.U32
dSumAboveLeftU8 DN  D2.U8
dIndexRow0U8    DN  D5.U8
dIndexRow0      DN  D5.U64
dIndexRow4U8    DN  D6.U8
dIndexRow4      DN  D6.U64
dDstRow0        DN  D0.U8
dDstRow4        DN  D4.U8
dConst128U8     DN  D0.U8

;// OMX_VC_CHROMA_PLANE

dRevAboveVal    DN  D3.U8
dRevAboveValU64 DN  D3.U64
dAboveLeftVal   DN  D2.U8
qAbove7minus0   QN  Q3.S16
qAboveDiff      QN  Q2.S16
dIndex          DN  D8.U8
dDiffAboveU8    DN  D9.U8
dDiffAboveS16   DN  D9.S16
dAboveDiff0U8   DN  D4.U8
dAboveDiff0U64  DN  D4.U64
dAbove7minus0U8 DN  D6.U8
dMultiplier     DN  D10.S16
dHorPred        DN  D11.S16
dRevLeftVal     DN  D3.U8
dRevLeftValU64  DN  D3.U64
qLeft7minus0    QN  Q7.S16
qLeftDiff       QN  Q6.S16
dDiffLeftU8     DN  D16.U8
dDiffLeftS16    DN  D16.S16
dLeftDiff0U8    DN  D12.U8
dLeftDiff0U64   DN  D12.U64
dLeft7minus0U8  DN  D14.U8
dVerPred        DN  D3.S16
dHVValS16       DN  D3.S16
dHVValS32       DN  D3.S32
dHVTempS32      DN  D2.S32
qA              QN  Q0.S16
qB              QN  Q2.S16
qC              QN  Q3.S16
qMultiplier     QN  Q5.S16
dMultiplier0    DN  D10.S16
dMultiplier1    DN  D11.S16
qC0             QN  Q0.S16
qC1             QN  Q1.S16
qC2             QN  Q4.S16
qC3             QN  Q5.S16
qC4             QN  Q6.S16
qC5             QN  Q7.S16
qC6             QN  Q8.S16
qC7             QN  Q9.S16
qSum0           QN  Q0.S16
qSum1           QN  Q1.S16
qSum2           QN  Q4.S16
qSum3           QN  Q5.S16
qSum4           QN  Q6.S16
qSum5           QN  Q7.S16
qSum6           QN  Q8.S16
qSum7           QN  Q9.S16
dSum0           DN  D0.U8
dSum1           DN  D1.U8
dSum2           DN  D2.U8
dSum3           DN  D3.U8
dSum4           DN  D4.U8
dSum5           DN  D5.U8
dSum6           DN  D6.U8
dSum7           DN  D7.U8

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//
;// Writes an 8x8 block of predicted chroma samples to pDst, using one of
;// four modes selected by predMode (DC, horizontal, vertical, plane).
;//
;// In (registers): r0 = pSrcLeft      left-neighbour column, rows leftStep bytes apart
;//                 r1 = pSrcAbove     8 bytes of the row above
;//                 r2 = pSrcAboveLeft above-left sample (read in plane mode only)
;//                 r3 = pDst          destination, rows dstStep bytes apart
;// In (stack):     LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out:            r0 = OMX_Sts_NoErr on every path
;//
;// Availability is tested (OMX_VC_LEFT / OMX_VC_UPPER) only in DC mode; the
;// other modes read their neighbours unconditionally.
;// NOTE(review): no argument validation is visible here. predMode is used
;// directly as an index into armVCM4P10_pIndexTable8x8, so an out-of-range
;// value loads an arbitrary word into pc - callers must pass a valid mode.
;// NOTE(review): "M_START ..., r10, d15" presumably makes the macro preserve
;// the callee-saved registers up to r10 / d15 - confirm in armCOMM_s.h.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15

        ;// Define stack arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        LDR     pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR   predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR   leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR   dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR   availability, Availability          ;// Arg availability loaded from stack to reg


        LDR     pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

OMX_VC_CHROMA_DC

        TST     availability, #OMX_VC_LEFT
        BEQ     DCChroma8x8LeftNotAvailable

        ;// Two interleaved pointers (even rows / odd rows), each advancing
        ;// by 2*leftStep per load.
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Load Left Edge
        VLD1    {dLeftVal[0]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                ;// pSrcLeft[7*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8LeftOnlyAvailable

        ;// Load Upper Edge also
        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[0 to 7]

        MOV     return, #OMX_Sts_NoErr              ;// returnNoError

        VPADDL  dSumAboveValU16, dAboveVal          ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16    ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]

        VPADDL  dSumLeftValU16, dLeftVal            ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16      ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]

        VADD    dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
        VRSHR   dSumAboveLeft,dSumAboveLeft,#3      ;// Sum = (Sum + 4) >> 3
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2  ;// Sum = (Sum + 2) >> 2
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2    ;// Sum = (Sum + 2) >> 2

        ;// Build the byte-select indices that assemble the four 4x4 quadrant
        ;// DC values from the three sum registers:
        ;//   rows 0-3: left half = above+left (first halves), right half = above (second half)
        ;//   rows 4-7: left half = left (second half), right half = above+left (second halves)
        VMOV    dIndexRow0U8,#0x0c
        VMOV    dIndexRow4U8,#0x04
        VSHL    dIndexRow0,dIndexRow0,#32           ;// index0 = 0x0c0c0c0c00000000
        VSHR    dIndexRow4,dIndexRow4,#32           ;// index4 = 0x0000000004040404
        VADD    dIndexRow4U8,dIndexRow4U8,dIndexRow0U8  ;// index4 = 0x0c0c0c0c04040404
        VTBL    dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8
        VTBL    dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8

DCChroma8x8LeftStore
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0,[pDst],step                ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pDst],step                ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pTmp],step                ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pDst],step                ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow4,[pTmp]                     ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


DCChroma8x8LeftOnlyAvailable

        MOV     return, #OMX_Sts_NoErr

        VPADDL  dSumLeftValU16, dLeftVal            ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumLeftValU32, dSumLeftValU16      ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2    ;// Sum = (Sum + 2) >> 2

        ;// Rows 0-3 take the DC of left[0..3], rows 4-7 the DC of left[4..7]
        VDUP    dDstRow0,dSumLeftValU8[0]
        VDUP    dDstRow4,dSumLeftValU8[4]

        B       DCChroma8x8LeftStore


DCChroma8x8LeftNotAvailable

        TST     availability, #OMX_VC_UPPER
        BEQ     DCChroma8x8NoneAvailable

        ;// Load Upper Edge
        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[0 to 7]
        MOV     return, #OMX_Sts_NoErr              ;// returnNoError

        VPADDL  dSumAboveValU16, dAboveVal          ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
        VPADDL  dSumAboveValU32, dSumAboveValU16    ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2  ;// Sum = (Sum + 2) >> 2
        VMOV    dIndexRow0U8,#0x04
        VSHL    dIndexRow0,dIndexRow0,#32           ;// index = 0x0404040400000000
        VTBL    dDstRow0,{dSumAboveValU8},dIndexRow0U8

        B       DCChroma8x8UpperStore


DCChroma8x8NoneAvailable

        VMOV    dConst128U8,#0x80                   ;// 0x8080808080808080 if(count == 0)
        MOV     return, #OMX_Sts_NoErr              ;// returnNoError

DCChroma8x8UpperStore

        ;// Common store: the same 8-byte row (D0) is written to all 8 rows
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dDstRow0,[pDst],step                ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp],step                ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pDst],step                ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dDstRow0,[pTmp]                     ;// pDst[7*dstStep+x] :0<= x <= 7

        M_EXIT


OMX_VC_CHROMA_VERT

        ;// dAboveVal and dDstRow0 are both D0, so the shared store loop
        ;// replicates the row above into every destination row.
        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[x]      :0<= x <= 7
        MOV     return, #OMX_Sts_NoErr

        B       DCChroma8x8UpperStore


OMX_VC_CHROMA_HOR

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Each load replicates one left-neighbour byte across a whole D
        ;// register. D0-D7 are also dSum0-dSum7, which the shared plane
        ;// store writes out row by row.
        VLD1    {dLeftVal0[]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal4[]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal5[]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal6[]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal7[]},[pTmp]                ;// pSrcLeft[7*leftStep]

        B       DCChroma8x8PlaneStore


OMX_VC_CHROMA_PLANE
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        VLD1    dAboveVal,[pSrcAbove]               ;// pSrcAbove[x]      :0<= x <= 7
        VLD1    dAboveLeftVal[0],[pSrcAboveLeft]

        VLD1    {dLeftVal[0]},[pSrcLeft],step       ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pTmp],step           ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft],step       ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pTmp],step           ;// pSrcLeft[3*leftStep]
        VLD1    {dLeftVal[4]},[pSrcLeft],step       ;// pSrcLeft[4*leftStep]
        VLD1    {dLeftVal[5]},[pTmp],step           ;// pSrcLeft[5*leftStep]
        VLD1    {dLeftVal[6]},[pSrcLeft],step       ;// pSrcLeft[6*leftStep]
        VLD1    {dLeftVal[7]},[pTmp]                ;// pSrcLeft[7*leftStep]


        VREV64  dRevAboveVal,dAboveVal              ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
        VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
        VSHR    dRevAboveValU64,dRevAboveValU64,#8  ;// pSrcAbove[X:0:1:2:3:4:5:6]
        VSUBL   qAboveDiff,dRevAboveVal,dAboveVal   ;// pSrcAbove[6] - pSrcAbove[0]
                                                    ;// pSrcAbove[5] - pSrcAbove[1]
                                                    ;// pSrcAbove[4] - pSrcAbove[2]

        VREV64  dRevLeftVal,dLeftVal                ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
        VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
        VSHR    dRevLeftValU64,dRevLeftValU64,#8    ;// pSrcLeft[X:0:1:2:3:4:5:6]
        VSUBL   qLeftDiff,dRevLeftVal,dLeftVal      ;// pSrcLeft[6] - pSrcLeft[0]
                                                    ;// pSrcLeft[5] - pSrcLeft[1]
                                                    ;// pSrcLeft[4] - pSrcLeft[2]

        ;// pMultiplierTable reuses r2; pSrcAboveLeft has already been consumed above
        LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
        VSHL    dAboveDiff0U64,dAboveDiff0U64,#16
        VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2   ;// pSrcAbove[ 7-0 | 4-2 | 5-1 | 6-0 ]
        VLD1    dMultiplier,[pMultiplierTable]!
        VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
        VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2      ;// pSrcLeft[ 7-0 | 4-2 | 5-1 | 6-0 ]


        VMUL    dHorPred,dDiffAboveS16,dMultiplier  ;// pSrcAbove[ 4*(7-0) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
        VMUL    dVerPred,dDiffLeftS16,dMultiplier
        VPADD   dHVValS16,dHorPred,dVerPred


        VPADDL  dHVValS32,dHVValS16                 ;// [V|H] in 32 bits each
        VSHL    dHVTempS32,dHVValS32,#4             ;// 17*H = 16*H + H = (H<<4)+H
        VADD    dHVValS32,dHVValS32,dHVTempS32      ;// [ 17*V  | 17*H ]in 32 bits each
        VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]  ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
        VRSHR   dHVValS32,dHVValS32,#5              ;// [c|b] in 16bits each
        VADDL   qA,dAboveVal,dLeftVal
        VDUP    qA,qA[7]
        VSHL    qA,qA,#4                            ;// [a|a|a|a|a|a|a|a]
        VDUP    qB,dHVValS16[0]                     ;// [b|b|b|b|b|b|b|b]
        VDUP    qC,dHVValS16[2]                     ;// [c|c|c|c|c|c|c|c]


        VMUL    qB,qB,qMultiplier                   ;// b*(x-3) for each column x
        VMUL    qC,qC,qMultiplier                   ;// c*(y-3) for each row y
        VADD    qB,qB,qA                            ;// a + b*(x-3)

        ;// Broadcast each row's c*(y-3) term. qC0..qC7 overwrite registers
        ;// (qA, qMultiplier, the diff temporaries) whose values are no
        ;// longer needed at this point.
        VDUP    qC0,qC[0]
        VDUP    qC1,qC[1]
        VDUP    qC2,qC[2]
        VDUP    qC3,qC[3]
        VDUP    qC4,qC[4]
        VDUP    qC5,qC[5]
        VDUP    qC6,qC[6]
        VDUP    qC7,qC[7]

        VADD    qSum0,qB,qC0
        VADD    qSum1,qB,qC1
        VADD    qSum2,qB,qC2
        VADD    qSum3,qB,qC3
        VADD    qSum4,qB,qC4
        VADD    qSum5,qB,qC5
        VADD    qSum6,qB,qC6
        VADD    qSum7,qB,qC7

        VQRSHRUN dSum0,qSum0,#5                     ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
        VQRSHRUN dSum1,qSum1,#5
        VQRSHRUN dSum2,qSum2,#5
        VQRSHRUN dSum3,qSum3,#5
        VQRSHRUN dSum4,qSum4,#5
        VQRSHRUN dSum5,qSum5,#5
        VQRSHRUN dSum6,qSum6,#5
        VQRSHRUN dSum7,qSum7,#5

DCChroma8x8PlaneStore
        ;// Shared by the HOR and PLANE cases: one distinct D register per row
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep

        VST1    dSum0,[pDst],step                   ;// pDst[0*dstStep+x] :0<= x <= 7
        VST1    dSum1,[pTmp],step                   ;// pDst[1*dstStep+x] :0<= x <= 7
        VST1    dSum2,[pDst],step                   ;// pDst[2*dstStep+x] :0<= x <= 7
        VST1    dSum3,[pTmp],step                   ;// pDst[3*dstStep+x] :0<= x <= 7
        VST1    dSum4,[pDst],step                   ;// pDst[4*dstStep+x] :0<= x <= 7
        VST1    dSum5,[pTmp],step                   ;// pDst[5*dstStep+x] :0<= x <= 7
        VST1    dSum6,[pDst],step                   ;// pDst[6*dstStep+x] :0<= x <= 7
        VST1    dSum7,[pTmp]                        ;// pDst[7*dstStep+x] :0<= x <= 7

        MOV     return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------