omxVCM4P10_PredictIntraChroma_8x8_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//


        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_pIndexTable8x8

;// Define the processor variants supported by this file

        M_VARIANTS ARM1136JS

        AREA table, DATA
;//-------------------------------------------------------
;// This table implements the C switch on predMode in asm
;// by the method of two levels of indexing: predMode is
;// scaled to a word offset into this table and the handler
;// address is loaded directly into pc (see the LDR pc below).
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable8x8
        DCD  OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR
        DCD  OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE

        ;// NOTE(review): this multiplier table is not referenced by the
        ;// ARM1136JS code in this file; presumably used by other processor
        ;// variants of this function - confirm before removing.
        M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
        DCW   3, 2, 1,4
        DCW  -3,-2,-1,0
        DCW   1, 2, 3,4

        IF ARM1136JS

;//--------------------------------------------
;// Constants
;//--------------------------------------------
BLK_SIZE        EQU 0x8                 ;// chroma prediction block is 8x8 samples
MUL_CONST0      EQU 0x01010101          ;// MUL by this replicates a byte into all 4 byte lanes
MASK_CONST      EQU 0x00FF00FF          ;// even-byte mask used when packing plane-mode samples
MUL_CONST1      EQU 0x80808080          ;// DC fallback fill (128 per byte) when neither edge is available

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
;// NOTE: several aliases share a physical register and the code relies on
;// this (e.g. sum1 = r6 = tVal6, sum2 = r7 = tVal7, leftStepx2 = r14 = tVal14).
y               RN 12
pc              RN 15
return          RN 0
pSrcLeft2       RN 1                    ;// second left-column pointer, interleaved with pSrcLeft
pDst2           RN 2                    ;// second destination row pointer (HOR mode)
sum1            RN 6
sum2            RN 7
pTable          RN 9
dstStepx2       RN 11
leftStepx2      RN 14
outerCount      RN 14
r0x01010101     RN 10
r0x00FF00FF     RN 11

tVal0           RN 0
tVal1           RN 1
tVal2           RN 2
tVal3           RN 3
tVal4           RN 4
tVal5           RN 5
tVal6           RN 6
tVal7           RN 7
tVal8           RN 8
tVal9           RN 9
tVal10          RN 10
tVal11          RN 11
tVal12          RN 12
tVal14          RN 14

;// Plane-mode gradients: b = horizontal step, c = vertical step
b               RN 14
c               RN 12

;// Packed 16-bit sample pairs for plane mode: pXpY = {p[X], p[Y]} in one word
p2p0            RN 0
p3p1            RN 1
p6p4            RN 2
p7p5            RN 4

pp2pp0          RN 6
pp3pp1          RN 7
pp6pp4          RN 8
pp7pp5          RN 9

p3210           RN 10
p7654           RN 10

;//--------------------------------------------
;// Input Arguments
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer: left edge samples (stride = leftStep)
pSrcAbove       RN 1    ;// input pointer: 8 samples above the block
pSrcAboveLeft   RN 2    ;// input pointer: above-left corner sample
pDst            RN 3    ;// output pointer: 8x8 prediction block (stride = dstStep)
leftStep        RN 4    ;// input variable (stack)
dstStep         RN 5    ;// input variable (stack)
predMode        RN 6    ;// input variable (stack): OMX_VC_CHROMA_{DC,HOR,VERT,PLANE}
availability    RN 7    ;// input variable (stack): OMX_VC_UPPER / OMX_VC_LEFT neighbour flags

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//
;// Computes the 8x8 intra chroma prediction for H.264 (M4P10).
;// In:   r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
;//       stack: leftStep, dstStep, predMode, availability
;// Out:  pDst[0..63] written row by row (dstStep between rows);
;//       r0 = OMX_Sts_NoErr
;// Dispatch: predMode indexes armVCM4P10_pIndexTable8x8 and jumps to one of
;//       the four mode handlers below.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        M_START omxVCM4P10_PredictIntraChroma_8x8, r11

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case


        ;// Load argument from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep, DstStep                    ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg

        MOV      y, #BLK_SIZE                        ;// Outer Loop Count
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode

;// DC mode: fill the block with the average of the available edge samples.
;// Rows 0-3 use avg(above[0..3]+left[0..3]) | avg(above[4..7]) in the two
;// 4-byte halves; rows 4-7 use avg(left[4..7]) | avg(above[4..7]+left[4..7]).
OMX_VC_CHROMA_DC
        AND      availability, availability,#(OMX_VC_UPPER + OMX_VC_LEFT)
        CMP      availability, #(OMX_VC_UPPER + OMX_VC_LEFT) ;// are both upper and left edges available?
        LDR      r0x01010101, =MUL_CONST0
        BNE      TST_UPPER                           ;// Jump if not both available
        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep

        ;// M_STALL ARM1136JS=1

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ;// Gather the 8 left-edge samples, alternating two pointers so each
        ;// advances by 2*leftStep (hides load-use latency).
        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
        ADD      tVal2, tVal8, tVal9                 ;// tVal2 = tVal8 + tVal9

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        ADD      tVal14, tVal4, tVal12               ;// tVal14 = tVal4 + tVal12

        LDRB     tVal4, [pSrcLeft]                   ;// tVal4 = pSrcLeft[6]
        LDRB     tVal12,[pSrcLeft2]                  ;// tVal12= pSrcLeft[7]
        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
        ADD      tVal2, tVal2, tVal14                ;// leftsum1 = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
        ADD      tVal14, tVal8, tVal4                ;// leftsum2 = sum(pSrcLeft[4] to pSrcLeft[7])
        ADD      tVal8, tVal14, #2                   ;// tVal8 = leftsum2 + 2
        ADD      tVal9, sum2, #2                     ;// tVal9 = upsum2 + 2
        ADD      sum1, sum1, tVal2                   ;// sum1 = upsum1 + leftsum1
        ADD      sum2, sum2, tVal14                  ;// sum2 = upsum2 + leftsum2
        ADD      sum1, sum1, #4                      ;// (sum1 + 4)
        ADD      sum2, sum2, #4                      ;// (sum2 + 4)
        MOV      sum1, sum1, LSR #3                  ;// (sum1 + 4)>>3 : avg of 8 samples
        MOV      tVal9, tVal9, LSR #2                ;// (upsum2 + 2)>>2 : avg of 4 samples
        MOV      tVal8, tVal8, LSR #2                ;// (leftsum2 + 2)>>2 : avg of 4 samples
        MOV      sum2, sum2, LSR #3                  ;// (sum2 + 4)>>3 : avg of 8 samples

        MUL      tVal0, sum1, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal1, tVal9,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal8, tVal8,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal9, sum2, r0x01010101            ;// replicate the val in all the bytes

        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[16 to 23] = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[24 to 31] = tVal 0 to 1

        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;// DC mode, upper edge only: every row is avg(above[0..3]) | avg(above[4..7]).
TST_UPPER

        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_UPPER         ;// if only the upper edge is available

        BNE      TST_LEFT                            ;// Jump to Left if not upper
        LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ;// M_STALL ARM1136JS=3

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]

        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])

        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      sum1, sum1,r0x01010101              ;// replicate the val in all the bytes
        MUL      sum2, sum2,r0x01010101              ;// replicate the val in all the bytes

        ;// sum1/sum2 are r6/r7, i.e. tVal6/tVal7 - the stores below use the
        ;// replicated averages just computed.
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;// DC mode, left edge only: rows 0-3 filled with avg(left[0..3]),
;// rows 4-7 filled with avg(left[4..7]).
TST_LEFT
        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_LEFT
        BNE      TST_COUNT0
        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]

        ADD      tVal6, tVal8, tVal9                 ;// tVal6 = tVal8 + tVal9

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        ADD      tVal7, tVal4, tVal12                ;// tVal7 = tVal4 + tVal12
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[6]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[7]

        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
        ADD      sum1, tVal6, tVal7                  ;// sum1 = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
        ADD      sum2, tVal8, tVal4                  ;// sum2 = sum(pSrcLeft[4] to pSrcLeft[7])

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      tVal6, sum1,r0x01010101             ;// replicate the val in all the bytes
        MUL      tVal8, sum2,r0x01010101             ;// replicate the val in all the bytes

        ;// M_STALL ARM1136JS=1
        MOV      tVal7,tVal6                         ;// tVal7 = replicated avg(left[0..3])
        MOV      tVal9,tVal8                         ;// tVal9 = replicated avg(left[4..7])

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7

        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9

        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway-break frm case

;// DC mode, neither edge available: fill the whole block with 128 (0x80).
TST_COUNT0
        LDR      sum1, =MUL_CONST1                   ;// sum1 (= tVal6) = 0x80808080 if(count == 0)

        ;// M_STALL ARM1136JS=2

        MOV      tVal7, sum1                         ;// tVal7 = 0x80808080

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7

        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway-break frm case

;// Horizontal mode: row y of the block is pSrcLeft[y*leftStep] replicated
;// across all 8 columns. Rows are written in pairs via pDst/pDst2; each STR
;// pair covers one 8-byte row (first half with #+4 post-inc, second half
;// advancing by 2*dstStep-4 to the next row pair).
OMX_VC_CHROMA_HOR

        ;// M_STALL ARM1136JS=2

        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
        ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = leftStep * 2
        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep
        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
        SUB      dstStepx2, dstStepx2, #4            ;// double dstStep minus 4 (the #+4 already advanced)
        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[0]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[1]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[2]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[3]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 0, bytes 0-3
        STR      tVal7, [pDst2], #+4                 ;// row 1, bytes 0-3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 0, bytes 4-7; advance to row 2
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 1, bytes 4-7; advance to row 3
        STR      tVal8, [pDst],  #+4                 ;// row 2, bytes 0-3
        STR      tVal9, [pDst2], #+4                 ;// row 3, bytes 0-3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 2, bytes 4-7; advance to row 4
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 3, bytes 4-7; advance to row 5
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[4]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[5]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[6]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[7]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 4, bytes 0-3
        STR      tVal7, [pDst2], #+4                 ;// row 5, bytes 0-3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 4, bytes 4-7; advance to row 6
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 5, bytes 4-7; advance to row 7
        STR      tVal8, [pDst],  #+4                 ;// row 6, bytes 0-3
        STR      tVal9, [pDst2], #+4                 ;// row 7, bytes 0-3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 6, bytes 4-7
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 7, bytes 4-7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;// Vertical mode: every row of the block is a copy of the 8 samples above.
OMX_VC_CHROMA_VERT

        ;// M_STALL ARM1136JS=4

        LDMIA    pSrcAbove, {tVal6,tVal7}            ;// tVal 6 to 7 = pSrcAbove[0 to 7]
        MOV      return, #OMX_Sts_NoErr

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7

        M_EXIT                                       ;// Macro to exit midway-break frm case

;// Plane mode: least-squares plane fit over the edge samples.
;//   H = 4*(A[7]-AL) + 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + (A[4]-A[2])
;//   V = 4*(L[7]-AL) + 3*(L[6]-L[0]) + 2*(L[5]-L[1]) + (L[4]-L[2])
;//   a = 16*(A[7] + L[7]);  b = (17*H + 16) >> 5;  c = (17*V + 16) >> 5
;//   pred(x,y) = clip8((a - 3*b - 3*c + 16 + b*x + c*y) >> 5)
;// (A = pSrcAbove, L = pSrcLeft column, AL = pSrcAboveLeft[0])
OMX_VC_CHROMA_PLANE

        ;// M_STALL ARM1136JS=3

        RSB      tVal14, leftStep, leftStep, LSL #3  ;// 7*leftStep
        LDRB     tVal7, [pSrcAbove, #+7]             ;// pSrcAbove[7]
        LDRB     tVal6, [pSrcLeft, +tVal14]          ;// pSrcLeft[7*leftStep]
        LDRB     tVal8, [pSrcAboveLeft]              ;// pSrcAboveLeft[0]
        LDRB     tVal9, [pSrcAbove, #+6 ]            ;// pSrcAbove[6]
        LDRB     tVal10,[pSrcAbove]                  ;// pSrcAbove[0]
        ADD      tVal2, tVal7, tVal6                 ;// pSrcAbove[7] + pSrcLeft[7*leftStep]
        SUB      tVal6, tVal6, tVal8                 ;// V0 = pSrcLeft[7*leftStep] - pSrcAboveLeft[0]
        SUB      tVal7, tVal7, tVal8                 ;// H0 = pSrcAbove[7] - pSrcAboveLeft[0]
        LSL      tVal2, tVal2, #4                    ;// a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
        ADD      tVal2, tVal2, #16                   ;// a + 16
        SUB      tVal9, tVal9,tVal10                 ;// pSrcAbove[6] - pSrcAbove[0]
        LDRB     tVal8, [pSrcAbove,#+5]              ;// pSrcAbove[5]
        LDRB     tVal10,[pSrcAbove,#+1]              ;// pSrcAbove[1]
        ADD      tVal9, tVal9, tVal9, LSL #1         ;// H1 = 3 * (pSrcAbove[6] - pSrcAbove[0])
        ADD      tVal7, tVal9, tVal7, LSL #2         ;// H = H1 + 4*H0
        SUB      tVal8, tVal8, tVal10                ;// pSrcAbove[5] - pSrcAbove[1]
        LDRB     tVal9, [pSrcAbove,#+4]              ;// pSrcAbove[4]
        LDRB     tVal10,[pSrcAbove,#+2]              ;// pSrcAbove[2]
        ADD      tVal7, tVal7, tVal8, LSL #1         ;// H = H + 2*(A[5]-A[1])
        SUB      tVal11, tVal14,leftStep             ;// 6*leftStep
        ADD      tVal11, pSrcLeft, tVal11            ;// pSrcLeft + 6*leftStep (walks upward)
        MOV      tVal12, pSrcLeft                    ;// pSrcLeft (walks downward)
        SUB      tVal9, tVal9, tVal10                ;// pSrcAbove[4] - pSrcAbove[2]
        ADD      tVal7, tVal7, tVal9                 ;// H = H + (A[4]-A[2])
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[6*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[0]
        ADD      tVal7, tVal7, tVal7, LSL #4         ;// 17 * H
        ADD      tVal7, tVal7, #16                   ;// 17 * H + 16
        SUB      tVal8, tVal8, tVal10                ;// pSrcLeft[6*leftStep] - pSrcLeft[0]
        ASR      b, tVal7, #5                        ;// b = (17 * H + 16) >> 5
        ADD      tVal8, tVal8, tVal8, LSL #1         ;// V1 = 3 * (pSrcLeft[6*leftStep] - pSrcLeft[0])
        ADD      tVal6, tVal8, tVal6, LSL #2         ;// V = V1 + 4*V0
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[5*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[leftStep]
        ADD      tVal7, b, b, LSL #1                 ;// 3*b
        SUB      tVal2, tVal2, tVal7                 ;// a + 16 - 3*b
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[5*leftStep] - pSrcLeft[leftStep]
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[4*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7, LSL #1         ;// V = V + 2*(L[5]-L[1])
        LDR      r0x00FF00FF, =MASK_CONST            ;// r0x00FF00FF = 0x00FF00FF
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7                 ;// V = V + (L[4]-L[2])
        SUB      dstStep, dstStep, #4                ;// dstStep - 4 (first STR in loop post-incs by 4)
        ADD      tVal6, tVal6, tVal6, LSL #4         ;// 17*V
        ADD      tVal6, tVal6, #16                   ;// 17*V + 16

        ;// M_STALL ARM1136JS=1

        ASR      c, tVal6, #5                        ;// c = (17*V + 16)>>5

        ;// M_STALL ARM1136JS=1

        ;// Build the first row's 8 samples (scaled by 32) as four packed
        ;// 16-bit pairs p2p0/p3p1/p6p4/p7p5; the loop then adds {c,c} per row.
        ADD      tVal6, c, c, LSL #1                 ;// 3*c
        UXTH     c, c                                ;// only in half word
        SUB      tVal6, tVal2, tVal6                 ;// p0 = a - 3*b - 3*c + 16
        ORR      c, c, c, LSL #16                    ;// {c, c}
        ADD      tVal7, b, b                         ;// 2b
        ADD      tVal2, tVal6, tVal7                 ;// p2 = p0 + 2*b
        ADD      tVal7, tVal7, b                     ;// 3b
        ORR      p2p0, tVal6, tVal2, LSL #16         ;// p2p0 = pack {p2, p0}
        UXTH     b, b
        UXTH     tVal7, tVal7
        ORR      b, b, b, LSL #16                    ;// {b,b}
        ORR      tVal7, tVal7, tVal7, LSL #16        ;// {3b,3b}
        SADD16   p3p1, p2p0, b                       ;// p3p1 = p2p0 + {b,b}
        SADD16   p6p4, p3p1, tVal7                   ;// p6p4 = p3p1 + {3b,3b}
        SADD16   p7p5, p6p4, b                       ;// p7p5 = p6p4 + {b,b}
        MOV      outerCount, #BLK_SIZE               ;// Outer Loop Count (one iteration per row)

LOOP_PLANE

        ;// USAT16 #13 clamps each halfword to [0, 8191] so the >>5 below
        ;// yields a value in [0, 255] (the clip8 of the plane formula).
        USAT16   p7p5, #13, p7p5                     ;// clip13(p7) clip13(p5)
        USAT16   p6p4, #13, p6p4                     ;// clip13(p6) clip13(p4)
        USAT16   p3p1, #13, p3p1                     ;// clip13(p3) clip13(p1)
        USAT16   p2p0, #13, p2p0                     ;// clip13(p2) clip13(p0)

        AND      pp7pp5, r0x00FF00FF, p7p5, ASR #5   ;// clip8(p7) clip8(p5)
        AND      pp6pp4, r0x00FF00FF, p6p4, ASR #5   ;// clip8(p6) clip8(p4)
        AND      pp3pp1, r0x00FF00FF, p3p1, ASR #5   ;// clip8(p3) clip8(p1)
        AND      pp2pp0, r0x00FF00FF, p2p0, ASR #5   ;// clip8(p2) clip8(p0)

        SUBS     outerCount, outerCount, #1          ;// outerCount--

        ORR      p3210, pp2pp0, pp3pp1, LSL #8       ;// pack {p3, p2, p1, p0}
        STR      p3210, [pDst], #4                   ;// store {pDst[0] to pDst[3]}

        ORR      p7654, pp6pp4, pp7pp5, LSL #8       ;// pack {p7, p6, p5, p4}
        M_STR    p7654, [pDst], dstStep              ;// store {pDst[4] to pDst[7]}; advance row

        SADD16   p7p5, p7p5, c                       ;// {p7 + c}, {p5 + c}
        SADD16   p6p4, p6p4, c                       ;// {p6 + c}, {p4 + c}
        SADD16   p3p1, p3p1, c                       ;// {p3 + c}, {p1 + c}
        SADD16   p2p0, p2p0, c                       ;// {p2 + c}, {p0 + c}

        BNE      LOOP_PLANE                          ;// Loop for 8 rows
        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// ARM1136JS




        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------