;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8


;//-------------------------------------------------------
;// This table implements a C-style switch statement in
;// assembly by the method of two levels of indexing:
;// predMode selects a table entry, and the entry is the
;// address of the code for that prediction mode.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable16x16
        DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR
        DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE


        ;// Everything from here to the matching ENDIF is assembled only
        ;// for the CortexA8 variant.
        IF CortexA8

        ;// Halfword multipliers used by the plane predictor.
        ;//   Row 0: weights for the H and V gradient sums (lanes 0..7)
        ;//   Row 1: x = 0..7   (scaled by b for the left half of a row)
        ;//   Row 2: x = 8..15  (scaled by b for the right half of a row)
        M_TABLE armVCM4P10_MultiplierTable16x16,1
        DCW   7,  6,  5,  4,  3,  2,  1,  8
        DCW   0,  1,  2,  3,  4,  5,  6,  7
        DCW   8,  9, 10, 11, 12, 13, 14, 15

;//--------------------------------------------
;// Constants
;//--------------------------------------------
BLK_SIZE        EQU 0x10
MUL_CONST0      EQU 0x01010101
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
y               RN 12
pc              RN 15

return          RN 0
pTable          RN 9
count           RN 11
pMultTable      RN 9
; ----------------------------------------------
; Neon registers
; ----------------------------------------------
;// NOTE: many of the aliases below name the same physical register with
;// a different element type, or name registers that overlap (a Q register
;// and its two D halves). The code relies on this overlap, so instruction
;// order matters wherever a value is consumed before its register is reused.
qAbove          QN Q0.U8
qLeft           QN Q1.U8
qSum8           QN Q0.U16
dSum80          DN D0.U16
dSum81          DN D1.U16
dSum4           DN D0.U16
dSum2           DN D0.U32
dSum1           DN D0.U64
qOut            QN Q3.U8
dSumLeft        DN D6.U64
dSumAbove       DN D7.U64
dSum            DN D8.U64
dSum0           DN D8.U8[0]

qH              QN Q11.S32
qV              QN Q12.S32
qA              QN Q11.S16
qB              QN Q6.S16
qC              QN Q7.S16

qB0             QN Q5.S16
qB1             QN Q6.S16
dA1             DN D23.S16

dH0             DN D22.S32
dH1             DN D23.S32
dV0             DN D24.S32
dV1             DN D25.S32

qHV             QN Q11.S64
qHV0            QN Q11.S32
qHV1            QN Q12.S64

dHV00           DN D22.S32
dHV01           DN D23.S32

dHV0            DN D22.S16[0]
dHV1            DN D23.S16[0]
dHV10           DN D24.S64
dHV11           DN D25.S64

qSum0           QN Q0.S16
qSum1           QN Q1.S16

dOut0           DN D6.U8
dOut1           DN D7.U8

dLeft0          DN D2.U8
dLeft1          DN D3.U8
qConst          QN Q13.S16

dAbove0         DN D0.U8
dAbove1         DN D1.U8

dRevLeft64      DN D12.U64
dRevLeft        DN D12.U8
dRevAbove64     DN D5.U64
dRevAbove       DN D5.U8
qLeftDiff       QN Q8.S16
dLeftDiff1      DN D17.S16
dLeftDiff64     DN D17.S64
qDiffLeft       QN Q8.S16
qDiffAbove      QN Q4.S16
dAboveDiff1     DN D9.S16
dAboveDiff64    DN D9.S64
qAboveDiff      QN Q4.S16

dAboveLeft      DN D4.U8

dDiffLeft0      DN D16.S16
dDiffLeft1      DN D17.S16
dDiffAbove0     DN D8.S16
dDiffAbove1     DN D9.S16

qLeft15minus0   QN Q7.S16
dLeft15minus0   DN D14.S16
qAbove15minus0  QN Q3.S16
dAbove15minus0  DN D6.S16

qMultiplier     QN Q10.S16
qMultiplier0    QN Q10.S16
qMultiplier1    QN Q12.S16
dMultiplier0    DN D20.S16
dMultiplier1    DN D21.S16

dBPlusCMult7    DN D1.S64
dBPlusCMult7S16 DN D1.S16

qTmp            QN Q0.U8

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

pTmp            RN 8
step            RN 10
pTmp2           RN 11

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Writes a 16x16 block of predicted bytes to pDst, one 16-byte row every
;// dstStep bytes, using the prediction mode selected by predMode.
;//
;// In:    r0      = pSrcLeft       left-neighbour bytes, one every leftStep bytes
;//        r1      = pSrcAbove      16 above-neighbour bytes, contiguous
;//        r2      = pSrcAboveLeft  above-left neighbour byte (read by PLANE only)
;//        r3      = pDst           destination block
;//        [stack] = LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out:   r0      = OMX_Sts_NoErr on every path
;//
;// predMode indexes armVCM4P10_pIndexTable16x16:
;//        0 = VERT, 1 = HOR, 2 = DC, 3 = PLANE
;// availability is tested against OMX_VC_LEFT / OMX_VC_UPPER in DC mode only;
;// the other modes read their neighbours unconditionally.
;//
;// NOTE(review): no argument is validated here. In particular predMode is
;// used unchecked as a table index, so a value outside 0..3 loads pc from
;// beyond the four-entry table. Callers must guarantee the range.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        ;// (r11 / d15 are passed to M_START; presumably the highest core and
        ;// NEON registers the macro must preserve -- see armCOMM_s.h)
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable,=armVCM4P10_pIndexTable16x16    ;// Load index table for switch case

        ;// Load argument from the stack
        M_LDR    predMode, PredMode                     ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                     ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                      ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability             ;// Arg availability loaded from stack to reg

        MOV      y, #BLK_SIZE                           ;// Row counter (consumed by the HOR and PLANE loops)
        LDR      pc, [pTable, predMode, LSL #2]         ;// Branch to the case based on predMode

;//--------------------------------------------
;// VERT: every output row is a copy of the 16 bytes at pSrcAbove.
;// Even rows are written through pDst and odd rows through pTmp,
;// each pointer advancing two rows (step = 2*dstStep) per store.
;//--------------------------------------------
OMX_VC_16X16_VERT
        VLD1    qAbove,  [pSrcAbove]
        ADD     pTmp, pDst, dstStep                     ;// pTmp = address of row 1
        ADD     step, dstStep, dstStep                  ;// step = 2 * dstStep
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst]                          ;// rows 14 and 15: no post-increment needed
        VST1    qAbove, [pTmp]
        MOV     return, #OMX_Sts_NoErr                  ;// returnNoError
        M_EXIT

;//--------------------------------------------
;// HOR: row i is the left-neighbour byte i replicated 16 times.
;// Two rows are handled per load/store pair (even rows via
;// pSrcLeft/pDst, odd rows via pTmp/pTmp2), so leftStep and dstStep
;// are doubled in place before the loop.
;//--------------------------------------------
OMX_VC_16X16_HOR
        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp  = address of left byte 1
        ADD     leftStep, leftStep, leftStep            ;// leftStep = 2 * leftStep
        ADD     pTmp2, pDst, dstStep                    ;// pTmp2 = address of row 1
        ADD     dstStep, dstStep, dstStep               ;// dstStep = 2 * dstStep
LoopHor
        VLD1    {qLeft[]}, [pSrcLeft], leftStep         ;// load one byte to all 16 lanes
        VLD1    {qTmp[]}, [pTmp], leftStep
        SUBS    y, y, #8                                ;// 8 rows are written per pass
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep
        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        BNE     LoopHor                                 ;// Two passes of 8 rows = 16 rows
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: fill the block with one value chosen from availability:
;//   left and upper : (sum(left) + sum(above) + 16) >> 5
;//   only one       : (sum of that edge + 8) >> 4
;//   neither        : 128
;// count records how many of the two edges were summed.
;//--------------------------------------------
OMX_VC_16X16_DC
        MOV      count, #0                              ;// count = 0
        TST      availability, #OMX_VC_LEFT
        BEQ      UpperOrNoneAvailable                   ;// Skip the left sum if left is not available

        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp = address of left byte 1
        ADD     step, leftStep, leftStep                ;// step = 2 * leftStep

        ;// Gather the 16 strided left bytes into the lanes of qLeft
        VLD1    {qLeft[0]}, [pSrcLeft],step
        VLD1    {qLeft[1]}, [pTmp],step
        VLD1    {qLeft[2]}, [pSrcLeft],step
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step
        VLD1    {qLeft[5]}, [pTmp],step
        VLD1    {qLeft[6]}, [pSrcLeft],step
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step
        VLD1    {qLeft[9]}, [pTmp],step
        VLD1    {qLeft[10]},[pSrcLeft],step
        VLD1    {qLeft[11]},[pTmp],step
        VLD1    {qLeft[12]},[pSrcLeft],step
        VLD1    {qLeft[13]},[pTmp],step
        VLD1    {qLeft[14]},[pSrcLeft],step
        VLD1    {qLeft[15]},[pTmp]

        ;// Reduce 16 bytes to a single sum: 16xU8 -> 8xU16 -> 4xU16 -> 2xU32 -> 1xU64
        VPADDL   qSum8, qLeft
        ADD      count, count, #1                       ;// left edge summed
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumLeft, dSum2
        VRSHR    dSum, dSumLeft, #4                     ;// (sumLeft + 8) >> 4, used if upper is absent

UpperOrNoneAvailable
        TST      availability,  #OMX_VC_UPPER           ;// if(availability & #OMX_VC_UPPER)
        BEQ      BothOrNoneAvailable                    ;// Skip the upper sum if upper is not available
        VLD1     qAbove, [pSrcAbove]
        ADD      count, count, #1                       ;// if upper inc count by 1
        VPADDL   qSum8, qAbove
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumAbove, dSum2
        VRSHR    dSum, dSumAbove, #4                    ;// (sumAbove + 8) >> 4, used if left is absent

BothOrNoneAvailable
        CMP      count, #2                              ;// check if both available
        BNE      NoneAvailable
        VADD     dSum, dSumAbove, dSumLeft
        VRSHR    dSum, dSum, #5                         ;// (sumAbove + sumLeft + 16) >> 5


        ;// Reached on every DC path, not only when nothing is available.
        ;// qOut overlaps dSumLeft/dSumAbove (D6/D7); both are dead by now.
NoneAvailable
        VDUP     qOut, dSum0                            ;// broadcast the low byte of dSum
        CMP      count, #0                              ;// check if none available
        ADD      pTmp, pDst, dstStep                    ;// pTmp = address of row 1
        ADD      step, dstStep, dstStep                 ;// step = 2 * dstStep
        BNE      LoopDC
        VMOV     qOut, #128                             ;// no neighbours: use the constant 128
LoopDC
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// PLANE: pred[y][x] = sat_u8((a + b*(x-7) + c*(y-7) + 16) >> 5) where
;//   H = sum over k=1..8 of k * (above[7+k] - above[7-k])   (above[-1] = aboveLeft)
;//   V = sum over k=1..8 of k * (left[7+k]  - left[7-k])    (left[-1]  = aboveLeft)
;//   a = 16 * (above[15] + left[15])
;//   b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
;// The first row is built as b*x + (a - 7*(b+c)); c is added per row.
;//--------------------------------------------
OMX_VC_16X16_PLANE
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                     ;// pSrcAbove[x] : 0 <= x <= 15
        VLD1    dAboveLeft[0],[pSrcAboveLeft]           ;// only lane 0 is loaded
        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp = address of left byte 1
        ADD     step, leftStep, leftStep                ;// step = 2 * leftStep
        ;// Gather the 16 strided left bytes into the lanes of qLeft
        VLD1    {qLeft[0]},  [pSrcLeft],step
        VLD1    {qLeft[1]},  [pTmp],step
        VLD1    {qLeft[2]},  [pSrcLeft],step
        VLD1    {qLeft[3]},  [pTmp],step
        VLD1    {qLeft[4]},  [pSrcLeft],step
        VLD1    {qLeft[5]},  [pTmp],step
        VLD1    {qLeft[6]},  [pSrcLeft],step
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step
        VLD1    {qLeft[9]},  [pTmp],step
        VLD1    {qLeft[10]}, [pSrcLeft],step
        VLD1    {qLeft[11]}, [pTmp],step
        VLD1    {qLeft[12]}, [pSrcLeft],step
        VLD1    {qLeft[13]}, [pTmp],step
        VLD1    {qLeft[14]}, [pSrcLeft],step
        VLD1    {qLeft[15]}, [pTmp]

        ;// Build the eight above-edge differences. Only lane 0 of
        ;// qAbove15minus0 is meaningful (the other lanes of dAboveLeft were
        ;// never loaded); VEXT below picks out exactly that lane.
        VREV64  dRevAbove, dAbove1                      ;// pSrcAbove[15:14:13:12:11:10:9:8]
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft   ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0]
        VSHR    dRevAbove64, dRevAbove64, #8            ;// pSrcAbove[14:13:12:11:10:9:8:X]
        VSUBL   qAboveDiff, dRevAbove, dAbove0          ;// lane k = pSrcAbove[14-k] - pSrcAbove[k]

        VSHL    dAboveDiff64, dAboveDiff64, #16         ;// discard the unused top lane
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1  ;// lanes 4..6 of the diffs, then above[15]-aboveLeft

        ;// Same construction for the left edge
        VREV64  dRevLeft,dLeft1                         ;// pSrcLeft[15:14:13:12:11:10:9:8]
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft      ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0]
        VSHR    dRevLeft64, dRevLeft64, #8              ;// pSrcLeft[14:13:12:11:10:9:8:X]
        VSUBL   qLeftDiff,dRevLeft, dLeft0              ;// lane k = pSrcLeft[14-k] - pSrcLeft[k]

        ;// Multiplier = [8|1|2|...|6|7]  (highest lane first; lanes 0..7 = 7,6,5,4,3,2,1,8)
        VLD1    qMultiplier, [pMultTable]!

        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1

        ;// Weighted differences, four 32-bit partial sums each for H and V
        VMULL   qH,dDiffAbove0, dMultiplier0
        VMULL   qV,dDiffLeft0, dMultiplier0
        VMLAL   qH,dDiffAbove1, dMultiplier1
        VMLAL   qV,dDiffLeft1, dMultiplier1

        ;// Reduce the partial sums: qHV = [V | H] as two 64-bit lanes
        VPADD   dHV00,dH1,dH0
        VPADD   dHV01,dV1,dV0
        VPADDL  qHV, qHV0
        VSHL    qHV1,qHV,#2                             ;// 4*H, 4*V
        VADD    qHV,qHV,qHV1                            ;// 5*H, 5*V

        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
        VRSHR   qHV,qHV,#6

        ;// HV1 = [c*7|b*7]
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV

        ;// Multiplier0 = [0, 1, 2, ..., 7]  (lane 0 first)
        VLD1    qMultiplier0, [pMultTable]!
        VDUP    qB, dHV0                                ;// b in every lane
        VDUP    qC, dHV1                                ;// c in every lane

        ;// a = 16 * (pSrcAbove[15] + pSrcLeft[15]), broadcast to every lane.
        ;// qA reuses Q11, so b and c must already have been copied out above.
        VADDL   qA,dAbove1,dLeft1
        VSHL    qA,qA, #4
        VDUP    qA,dA1[3]
        VADD    dBPlusCMult7, dHV10, dHV11              ;// 7*b + 7*c (must precede the reload of Q12 below)

        ;// Multiplier1 = [8, 9, 10, ..., 15]  (lane 0 first)
        VLD1    qMultiplier1, [pMultTable]
        ;// Const = a - 7*(b+c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst

        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
        VMUL    qB0,qB,qMultiplier0

        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]
        VMUL    qB1,qB,qMultiplier1

        ;// Row 0 before rounding: b*x + Const
        VADD    qSum0, qB0, qConst
        VADD    qSum1, qB1, qConst

        ;// Loops for 16 times (y counts down from BLK_SIZE by 1)
LoopPlane
        ;// (b*x + c*y + C)>>5, rounded and saturated to unsigned 8 bits
        VQRSHRUN dOut0, qSum0,#5
        VQRSHRUN dOut1, qSum1,#5
        SUBS     y, y, #1
        VST1     qOut,[pDst],dstStep
        VADD     qSum0,qSum0,qC                         ;// next row: add c
        VADD     qSum1,qSum1,qC
        BNE      LoopPlane

        MOV      return, #OMX_Sts_NoErr

        M_END

        ENDIF ;// CortexA8

        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------