;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

;//-------------------------------------------------------
;// Dispatch table: one code address per prediction mode.
;// The function loads pc from entry [predMode], which is
;// how the C-style switch on predMode is implemented.
;// Entry order must match the numeric values of predMode.
;//-------------------------------------------------------

        M_TABLE armVCM4P10_pIndexTable16x16
        DCD     OMX_VC_16X16_VERT,  OMX_VC_16X16_HOR
        DCD     OMX_VC_16X16_DC,    OMX_VC_16X16_PLANE

        IF ARM1136JS

;//--------------------------------------------
;// Numeric constants
;//--------------------------------------------
BLK_SIZE        EQU 0x10            ;// rows (and bytes per row) in a 16x16 block
MUL_CONST0      EQU 0x01010101      ;// byte * this = byte replicated in all 4 lanes
MUL_CONST1      EQU 0x00060004      ;// not referenced by the code below
MUL_CONST2      EQU 0x00070005      ;// not referenced by the code below
MUL_CONST3      EQU 0x00030001      ;// not referenced by the code below
MASK_CONST      EQU 0x00FF00FF      ;// keeps the low byte of each halfword

;//--------------------------------------------
;// Input registers
;// r0-r3 arrive in registers; r4-r7 are loaded
;// from the stack arguments by the function.
;//--------------------------------------------
pSrcLeft        RN 0                ;// input pointer
pSrcAbove       RN 1                ;// input pointer
pSrcAboveLeft   RN 2                ;// input pointer
pDst            RN 3                ;// output pointer
leftStep        RN 4                ;// input variable
dstStep         RN 5                ;// input variable
predMode        RN 6                ;// input variable
availability    RN 7                ;// input variable

;//--------------------------------------------
;// Scratch aliases (several names deliberately
;// share one physical register; each name is
;// only live inside the case that uses it)
;//--------------------------------------------

;// Result / program counter
return          RN 0
pc              RN 15

;// Loop counters
innerCount      RN 0
outerCount      RN 1
y               RN 12
count           RN 12

;// Secondary pointers and doubled strides
pSrcLeft2       RN 1
pDst2           RN 2
dstStepx2       RN 11
leftStepx2      RN 14

;// Accumulator, table base, temporaries, constants
sum             RN 6
pTable          RN 9
temp1           RN 10
temp2           RN 12
cMul1           RN 11
cMul2           RN 12
r0x01010101     RN 10
r0x00FF00FF     RN 11

;// Generic value registers (tValN == rN)
tVal0           RN 0
tVal1           RN 1
tVal2           RN 2
tVal3           RN 3
tVal4           RN 4
tVal5           RN 5
tVal6           RN 6
tVal7           RN 7
tVal8           RN 8
tVal9           RN 9
tVal10          RN 10
tVal11          RN 11
tVal12          RN 12
tVal14          RN 14

;// Plane-mode gradients
b               RN 12
c               RN 14

;// Plane-mode packed column pairs {high halfword, low halfword}
p2p0            RN 0
p3p1            RN 1
p6p4            RN 2
p7p5            RN 4
p10p8           RN 6
p11p9           RN 7
p14p12          RN 8
p15p13          RN 9

;// Packed 4-pixel output words
p3210           RN 10
p7654           RN 10
p111098         RN 10
p15141312       RN 10

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
;// omxVCM4P10_PredictIntra_16x16
;//
;// Fills a 16x16 byte block at pDst (row pitch dstStep) with an intra
;// prediction built from the neighbouring pixels, selected by predMode:
;//   OMX_VC_16X16_VERT  - every row is a copy of pSrcAbove[0..15]
;//   OMX_VC_16X16_HOR   - row y is filled with pSrcLeft[y*leftStep]
;//   OMX_VC_16X16_DC    - whole block is the rounded mean of the available
;//                        above/left neighbours (128 when neither is available)
;//   OMX_VC_16X16_PLANE - clipped linear gradient a + b*(x-7) + c*(y-7)
;//
;// In:   r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
;//       stack: LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out:  r0 = OMX_Sts_NoErr on every path
;//
;// NOTE(review): predMode indexes the jump table without a range check and
;// the pointers are not validated here - presumably the caller guarantees
;// both; confirm.
;// NOTE(review): VERT/DC use LDM/STM on pSrcAbove/pDst, so those pointers
;// are presumably word aligned - confirm against the API contract.
;// NOTE(review): r14 is used as scratch, so M_START/M_EXIT/M_END are assumed
;// to save and restore lr along with r4-r11 (see armCOMM_s.h).

        M_START omxVCM4P10_PredictIntra_16x16, r11

        ;// Define stack arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR     pTable, =armVCM4P10_pIndexTable16x16    ;// Base of the per-mode jump table

        ;// Load arguments from the stack
        M_LDR   predMode,     PredMode
        M_LDR   leftStep,     LeftStep
        M_LDR   dstStep,      DstStep
        M_LDR   availability, Availability

        MOV     y, #BLK_SIZE                            ;// Row counter used by the VERT case
        LDR     pc, [pTable, predMode, LSL #2]          ;// switch(predMode): jump to table[predMode]

;//--------------------------------------------
;// Vertical: replicate the row above into all 16 rows
;//--------------------------------------------
OMX_VC_16X16_VERT
        LDM     pSrcAbove, {tVal6,tVal7,tVal8,tVal9}    ;// tVal6..9 = pSrcAbove[0..15]
        ADD     dstStepx2, dstStep, dstStep             ;// dstStepx2 = 2*dstStep
        ADD     pDst2, pDst, dstStep                    ;// pDst2 = second row; pDst/pDst2 walk even/odd rows

        ;// M_STALL ARM1136JS=2                         ;// Stall outside the loop

LOOP_VERT
        STM     pDst, {tVal6,tVal7,tVal8,tVal9}         ;// even row = pSrcAbove[0..15]
        SUBS    y, y, #2                                ;// two rows written per pass
        ADD     pDst, pDst, dstStepx2                   ;// next even row
        STM     pDst2, {tVal6,tVal7,tVal8,tVal9}        ;// odd row = pSrcAbove[0..15]
        ADD     pDst2, pDst2, dstStepx2                 ;// next odd row
        BNE     LOOP_VERT                               ;// 8 passes
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// Horizontal: row y is 16 copies of pSrcLeft[y*leftStep]
;// Four rows per pass; the two left pixels for the next
;// pass are fetched at the bottom of the current pass.
;//--------------------------------------------
OMX_VC_16X16_HOR

        ;// M_STALL ARM1136JS=6

        LDR     r0x01010101, =MUL_CONST0                ;// multiplier that copies a byte into all 4 lanes
        MOV     y, #4                                   ;// 4 passes of 4 rows
        M_LDRB  tVal6, [pSrcLeft], +leftStep            ;// tVal6 = left pixel of row 0
        ADD     pDst2, pDst, dstStep                    ;// pDst2 = second row
        M_LDRB  tVal7, [pSrcLeft], +leftStep            ;// tVal7 = left pixel of row 1
        ADD     dstStepx2, dstStep, dstStep             ;// 2*dstStep
        SUB     dstStepx2, dstStepx2, #12               ;// minus the 12 bytes already advanced by three #4 stores

LOOP_HOR
        M_LDRB  tVal8, [pSrcLeft], +leftStep            ;// tVal8 = left pixel of row 4k+2
        MUL     tVal6, tVal6, r0x01010101               ;// replicate row 4k pixel into a word
        M_LDRB  tVal9, [pSrcLeft], +leftStep            ;// tVal9 = left pixel of row 4k+3
        MUL     tVal7, tVal7, r0x01010101               ;// replicate row 4k+1 pixel into a word
        SUBS    y, y, #1                                ;// Z set on the last pass; nothing below alters flags
        STR     tVal6, [pDst],  #+4                     ;// row 4k,   bytes 0..3
        STR     tVal7, [pDst2], #+4                     ;// row 4k+1, bytes 0..3
        STR     tVal6, [pDst],  #+4                     ;// row 4k,   bytes 4..7
        STR     tVal7, [pDst2], #+4                     ;// row 4k+1, bytes 4..7
        MUL     tVal8, tVal8, r0x01010101               ;// replicate row 4k+2 pixel into a word
        STR     tVal6, [pDst],  #+4                     ;// row 4k,   bytes 8..11
        STR     tVal7, [pDst2], #+4                     ;// row 4k+1, bytes 8..11
        MUL     tVal9, tVal9, r0x01010101               ;// replicate row 4k+3 pixel into a word
        M_STR   tVal6, [pDst],  dstStepx2               ;// row 4k,   bytes 12..15; pDst  -> row 4k+2
        M_STR   tVal7, [pDst2], dstStepx2               ;// row 4k+1, bytes 12..15; pDst2 -> row 4k+3
        STR     tVal8, [pDst],  #+4                     ;// row 4k+2, bytes 0..3
        STR     tVal9, [pDst2], #+4                     ;// row 4k+3, bytes 0..3
        STR     tVal8, [pDst],  #+4                     ;// row 4k+2, bytes 4..7
        STR     tVal9, [pDst2], #+4                     ;// row 4k+3, bytes 4..7
        STR     tVal8, [pDst],  #+4                     ;// row 4k+2, bytes 8..11
        STR     tVal9, [pDst2], #+4                     ;// row 4k+3, bytes 8..11
        M_STR   tVal8, [pDst],  dstStepx2               ;// row 4k+2, bytes 12..15; pDst  -> row 4k+4
        ;// Look-ahead loads are conditional: on the final pass they would
        ;// otherwise read pSrcLeft[16*leftStep] and [17*leftStep], which lie
        ;// beyond the 16 left-neighbour pixels.
        LDRNEB  tVal6, [pSrcLeft], +leftStep            ;// next pass: left pixel of row 4k+4
        M_STR   tVal9, [pDst2], dstStepx2               ;// row 4k+3, bytes 12..15; pDst2 -> row 4k+5
        LDRNEB  tVal7, [pSrcLeft], +leftStep            ;// next pass: left pixel of row 4k+5
        BNE     LOOP_HOR                                ;// 4 passes in total
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// DC: block = mean of the available neighbours
;//   count = number of available edges (upper, left)
;//   count == 0 : 128
;//   count == 1 : (sum + 8)  >> 4
;//   count == 2 : (sumLeft + sumUpper + 16) >> 5
;//--------------------------------------------
OMX_VC_16X16_DC

        ;// M_STALL ARM1136JS=2

        MOV     count, #0                               ;// no edge counted yet
        TST     availability, #OMX_VC_UPPER             ;// upper row available?
        BEQ     TST_LEFT                                ;// no: skip the upper sum
        LDM     pSrcAbove,{tVal8,tVal9,tVal10,tVal11}   ;// tVal8..11 = pSrcAbove[0..15]
        ADD     count, count, #1                        ;// upper edge counted

        ;// M_STALL ARM1136JS=2

        ;// Sum the 16 bytes as two parallel 16-bit lanes
        UXTB16  tVal2, tVal8                            ;// pSrcAbove[0, 2]
        UXTB16  tVal6, tVal9                            ;// pSrcAbove[4, 6]
        UADD16  tVal2, tVal2, tVal6                     ;// [0,2] + [4,6]
        UXTB16  tVal8, tVal8, ROR #8                    ;// pSrcAbove[1, 3]
        UXTB16  tVal9, tVal9, ROR #8                    ;// pSrcAbove[5, 7]
        UADD16  tVal8, tVal8, tVal9                     ;// [1,3] + [5,7]
        UADD16  tVal2, tVal2, tVal8                     ;// lanes hold the partial sums of pSrcAbove[0..7]

        UXTB16  tVal8, tVal10                           ;// pSrcAbove[8, 10]
        UXTB16  tVal9, tVal11                           ;// pSrcAbove[12, 14]
        UADD16  tVal8, tVal8, tVal9                     ;// [8,10] + [12,14]
        UXTB16  tVal10, tVal10, ROR #8                  ;// pSrcAbove[9, 11]
        UXTB16  tVal11, tVal11, ROR #8                  ;// pSrcAbove[13, 15]
        UADD16  tVal10, tVal10, tVal11                  ;// [9,11] + [13,15]
        UADD16  tVal8, tVal8, tVal10                    ;// lanes hold the partial sums of pSrcAbove[8..15]

        UADD16  tVal2, tVal2, tVal8                     ;// lanes hold the partial sums of pSrcAbove[0..15]

        ;// M_STALL ARM1136JS=1

        ADD     tVal2, tVal2, tVal2, LSR #16            ;// low halfword = full upper sum (high halfword is junk)

        ;// M_STALL ARM1136JS=1

        UXTH    sum, tVal2                              ;// sum = upper sum (used when only upper is available)

TST_LEFT
        TST     availability, #OMX_VC_LEFT              ;// left column available?
        BEQ     TST_COUNT                               ;// no: skip the left sum
        ADD     leftStepx2, leftStep,leftStep           ;// leftStepx2 = 2 * leftStep
        ADD     pSrcLeft2, pSrcLeft, leftStep           ;// pSrcLeft walks even rows, pSrcLeft2 odd rows

        M_LDRB  tVal8,  [pSrcLeft],  +leftStepx2        ;// left pixel, row 0
        M_LDRB  tVal9,  [pSrcLeft2], +leftStepx2        ;// left pixel, row 1
        M_LDRB  tVal10, [pSrcLeft],  +leftStepx2        ;// left pixel, row 2
        M_LDRB  tVal11, [pSrcLeft2], +leftStepx2        ;// left pixel, row 3
        ADD     tVal7, tVal8, tVal9                     ;// rows 0+1
        ADD     count, count, #1                        ;// left edge counted
        ADD     tVal6, tVal10, tVal11                   ;// rows 2+3

        M_LDRB  tVal8,  [pSrcLeft],  +leftStepx2        ;// left pixel, row 4
        M_LDRB  tVal9,  [pSrcLeft2], +leftStepx2        ;// left pixel, row 5
        M_LDRB  tVal10, [pSrcLeft],  +leftStepx2        ;// left pixel, row 6
        M_LDRB  tVal11, [pSrcLeft2], +leftStepx2        ;// left pixel, row 7
        ADD     sum, tVal7, tVal6                       ;// sum = rows 0..3 (replaces the upper sum; it survives in tVal2)
        ADD     tVal8, tVal8, tVal9                     ;// rows 4+5
        ADD     tVal10, tVal10, tVal11                  ;// rows 6+7
        ADD     tVal7, tVal8, tVal10                    ;// rows 4..7


        M_LDRB  tVal8,  [pSrcLeft],  +leftStepx2        ;// left pixel, row 8
        M_LDRB  tVal9,  [pSrcLeft2], +leftStepx2        ;// left pixel, row 9
        M_LDRB  tVal10, [pSrcLeft],  +leftStepx2        ;// left pixel, row 10
        M_LDRB  tVal11, [pSrcLeft2], +leftStepx2        ;// left pixel, row 11
        ADD     sum, sum, tVal7                         ;// sum = rows 0..7
        ADD     tVal8, tVal8, tVal9                     ;// rows 8+9
        ADD     tVal10, tVal10, tVal11                  ;// rows 10+11
        ADD     tVal7, tVal8, tVal10                    ;// rows 8..11


        M_LDRB  tVal8,  [pSrcLeft],  +leftStepx2        ;// left pixel, row 12
        M_LDRB  tVal9,  [pSrcLeft2], +leftStepx2        ;// left pixel, row 13
        M_LDRB  tVal10, [pSrcLeft],  +leftStepx2        ;// left pixel, row 14
        M_LDRB  tVal11, [pSrcLeft2], +leftStepx2        ;// left pixel, row 15
        ADD     sum, sum, tVal7                         ;// sum = rows 0..11
        ADD     tVal8, tVal8, tVal9                     ;// rows 12+13
        ADD     tVal10, tVal10, tVal11                  ;// rows 14+15
        ADD     tVal7, tVal8, tVal10                    ;// rows 12..15
        ADD     sum, sum, tVal7                         ;// sum = rows 0..15

TST_COUNT
        CMP     count, #0                               ;// neither edge available?
        MOVEQ   sum, #128                               ;// yes: DC value is 128
        BEQ     TST_COUNT0
        CMP     count, #1                               ;// EQ: one edge, NE: both edges (flags kept until LSRNE)
        ADDEQ   sum, sum, #8                            ;// one edge: rounding term for >> 4
        ADDNE   sum, sum, tVal2                         ;// both edges: left sum + upper sum
        ADDNE   sum, sum, #16                           ;// both edges: rounding term for >> 5

        ;// M_STALL ARM1136JS=1

        UXTH    sum, sum                                ;// drop the junk high halfword brought in by tVal2

        ;// M_STALL ARM1136JS=1

        LSREQ   sum, sum, #4                            ;// one edge:   mean of 16 pixels

        ;// M_STALL ARM1136JS=1

        LSRNE   sum, sum, #5                            ;// both edges: mean of 32 pixels

TST_COUNT0

        ;// M_STALL ARM1136JS=1

        ORR     sum, sum, sum, LSL #8                   ;// DC byte in both bytes of the low halfword

        ;// M_STALL ARM1136JS=1

        ORR     tVal6, sum, sum, LSL #16                ;// DC byte in all four bytes
        CPY     tVal7, tVal6                            ;// tVal7..9 = same word, for a 16-byte STM
        CPY     tVal8, tVal6
        CPY     tVal9, tVal6
        ADD     dstStepx2, dstStep, dstStep             ;// 2*dstStep
        ADD     pDst2, pDst, dstStep                    ;// pDst2 = second row
        MOV     y, #BLK_SIZE                            ;// 16 rows to write

LOOP_DC
        STM     pDst, {tVal6,tVal7,tVal8,tVal9}         ;// even row = DC value x16
        SUBS    y, y, #2                                ;// two rows written per pass
        ADD     pDst, pDst, dstStepx2                   ;// next even row
        STM     pDst2, {tVal6,tVal7,tVal8,tVal9}        ;// odd row = DC value x16
        ADD     pDst2, pDst2, dstStepx2                 ;// next odd row
        BNE     LOOP_DC                                 ;// 8 passes

        MOV     return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// Plane:
;//   a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])
;//   H = sum_{k=1..8} k * (pSrcAbove[7+k] - pSrcAbove[7-k])   (pSrcAbove[-1] == pSrcAboveLeft[0])
;//   V = same weighting down the left column
;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
;//   pred(x,y) = clip8((a + 16 + b*(x-7) + c*(y-7)) >> 5)
;//--------------------------------------------
OMX_VC_16X16_PLANE

        ;// M_STALL ARM1136JS=3
        RSB     tVal14, leftStep, leftStep, LSL #4      ;// tVal14 = 15*leftStep

        ;// M_STALL ARM1136JS=2
        LDRB    tVal10, [pSrcLeft,  tVal14]             ;// tVal10 = pSrcLeft[15*leftStep]
        LDRB    tVal11, [pSrcAboveLeft]                 ;// tVal11 = pSrcAboveLeft[0]
        LDRB    tVal12, [pSrcAbove, #15]                ;// tVal12 = pSrcAbove[15]

        ADD     tVal2,  tVal12,  tVal10                 ;// tVal2  = pSrcAbove[15] + pSrcLeft[15*leftStep]
        SUB     tVal10, tVal10,  tVal11                 ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
        SUB     tVal11, tVal12,  tVal11                 ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
        MOV     tVal2,  tVal2,   LSL #4                 ;// tVal2  = a

        ;// Accumulate H in tVal11
        MOV     tVal11, tVal11, LSL #3                  ;// 8*([15]-[-1])
        LDRB    tVal6, [pSrcAbove, #0]
        LDRB    tVal7, [pSrcAbove, #14]
        SUB     tVal8, tVal7, tVal6
        RSB     tVal8, tVal8, tVal8, LSL #3             ;// 7*([14]-[0])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #1]
        LDRB    tVal7, [pSrcAbove, #13]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8                     ;// 2*([13]-[1])
        ADD     tVal8, tVal8, tVal8, LSL #1             ;// 6*([13]-[1])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #2]
        LDRB    tVal7, [pSrcAbove, #12]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8, LSL #2             ;// 5*([12]-[2])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #3]
        LDRB    tVal7, [pSrcAbove, #11]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal11, tVal11, tVal8, LSL #2           ;// + 4*([11]-[3])
        LDRB    tVal6, [pSrcAbove, #4]
        LDRB    tVal7, [pSrcAbove, #10]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal8, tVal8, tVal8, LSL #1             ;// 3*([10]-[4])
        ADD     tVal11, tVal11, tVal8
        LDRB    tVal6, [pSrcAbove, #5]
        LDRB    tVal7, [pSrcAbove, #9]
        SUB     tVal8, tVal7, tVal6
        ADD     tVal11, tVal11, tVal8, LSL #1           ;// + 2*([9]-[5])
        LDRB    tVal6, [pSrcAbove, #6]
        LDRB    tVal7, [pSrcAbove, #8]
        SUB     tVal8, tVal7, tVal6                     ;// 1*([8]-[6])
        ADD     tVal7, tVal11, tVal8                    ;// tVal7 = H

        ADD     tVal2,  tVal2,   #16                    ;// tVal2  = a + 16
        MOV     tVal1,  pSrcLeft                        ;// tVal1  = pSrcLeft (walks down from row 0; pSrcAbove no longer needed)
        SUB     tVal9,  tVal14,   leftStep              ;// tVal9  = 14*leftStep
        ADD     tVal9,  pSrcLeft, tVal9                 ;// tVal9  = pSrcLeft + 14*leftStep (walks up from row 14)

        ;// Accumulate V in tVal6, interleaved with the computation of b
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[14*leftStep]
        M_LDRB  tVal11, [tVal1], +leftStep              ;// tVal11 = pSrcLeft[0]
        ADD     tVal7,  tVal7,  tVal7,    LSL #2        ;// tVal7  = 5 * H
        ADD     tVal7,  tVal7,  #32                     ;// tVal7  = 5 * H + 32
        SUB     tVal8,  tVal8,  tVal11                  ;// tVal8  = pSrcLeft[14*leftStep] - pSrcLeft[0]
        ASR     tVal12, tVal7,  #6                      ;// tVal12 = b = (5 * H + 32) >> 6

        RSB     tVal8,  tVal8,  tVal8,    LSL #3        ;// tVal8  = 7 * (pSrcLeft[14*leftStep] - pSrcLeft[0])
        ADD     tVal6,  tVal8,  tVal10,   LSL #3        ;// tVal6  = V = 8*V0 + 7*(row14 - row0)
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[13*leftStep]
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[leftStep]
        RSB     tVal7,  tVal12,  tVal12,  LSL #3        ;// tVal7  = 7*b
        SUB     tVal2,  tVal2,   tVal7                  ;// tVal2  = a + 16 - 7*b
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row13 - row1
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[12*leftStep]
        ADD     tVal7,  tVal7,   tVal7                  ;// tVal7  = 2 * (row13 - row1)
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[2*leftStep]
        ADD     tVal7,  tVal7,   tVal7,  LSL #1         ;// tVal7  = 6 * (row13 - row1)
        ADD     tVal6,  tVal6,   tVal7                  ;// V += 6 * (row13 - row1)
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row12 - row2
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[11*leftStep]
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[3*leftStep]
        ADD     tVal7,  tVal7,   tVal7,  LSL #2         ;// tVal7  = 5 * (row12 - row2)
        ADD     tVal6,  tVal6,   tVal7                  ;// V += 5 * (row12 - row2)
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row11 - row3
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[10*leftStep]
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[4*leftStep]
        ADD     tVal6,  tVal6,   tVal7,  LSL #2         ;// V += 4 * (row11 - row3)
        SUB     dstStep, dstStep, #16                   ;// dstStep -= 16: each row advances pDst by 16 via its stores
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row10 - row4
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[9*leftStep]
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[5*leftStep]
        ADD     tVal7,  tVal7,   tVal7,  LSL #1         ;// tVal7  = 3 * (row10 - row4)
        ADD     tVal6,  tVal6,   tVal7                  ;// V += 3 * (row10 - row4)
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row9 - row5
        M_LDRB  tVal8,  [tVal9], -leftStep              ;// tVal8  = pSrcLeft[8*leftStep]
        M_LDRB  tVal10, [tVal1], +leftStep              ;// tVal10 = pSrcLeft[6*leftStep]
        ADD     tVal6,  tVal6,   tVal7,  LSL #1         ;// V += 2 * (row9 - row5)

        ;// M_STALL ARM1136JS=1
        SUB     tVal7,  tVal8,   tVal10                 ;// tVal7  = row8 - row6
        ADD     tVal6,  tVal6,   tVal7                  ;// V += 1 * (row8 - row6)

        ;// M_STALL ARM1136JS=1
        ADD     tVal6,  tVal6,   tVal6,  LSL #2         ;// tVal6  = 5*V
        ADD     tVal6,  tVal6,   #32                    ;// tVal6  = 5*V + 32

        ;// M_STALL ARM1136JS=1
        ASR     tVal14, tVal6,   #6                     ;// tVal14 = c = (5*V + 32) >> 6

        ;// M_STALL ARM1136JS=1
        RSB     tVal6,  tVal14,  tVal14,  LSL #3        ;// tVal6  = 7*c
        UXTH    tVal14, tVal14                          ;// keep the low halfword of c
        ADD     tVal10, tVal12,  tVal12                 ;// tVal10 = 2*b
        ORR     tVal14, tVal14,  tVal14,  LSL #16       ;// tVal14 = {c, c}
        SUB     tVal6,  tVal2,   tVal6                  ;// tVal6  = d = a + 16 - 7*b - 7*c   (pixel (0,0) before >> 5)
        ADD     tVal1,  tVal6,   tVal10                 ;// tVal1  = d + 2*b                  (pixel (2,0) before >> 5)
        ADD     tVal10, tVal10,  tVal12                 ;// tVal10 = 3*b
        ;// PKHBT takes only the low halfword of d. d can be negative, and an
        ;// ORR of the sign-extended word would force the p2 lane to 0xFFFF.
        PKHBT   tVal0,  tVal6,   tVal1,   LSL #16       ;// tVal0  = p2p0 = {d + 2*b, d}
        UXTH    tVal12, tVal12                          ;// keep the low halfword of b
        UXTH    tVal10, tVal10                          ;// keep the low halfword of 3*b
        ORR     tVal12, tVal12,  tVal12,  LSL #16       ;// tVal12 = {b, b}
        ORR     tVal10, tVal10,  tVal10,  LSL #16       ;// tVal10 = {3b, 3b}
        SADD16  tVal1,  tVal0,   tVal12                 ;// p3p1   = p2p0   + {b,b}
        SADD16  tVal2,  tVal1,   tVal10                 ;// p6p4   = p3p1   + {3b,3b}
        SADD16  tVal4,  tVal2,   tVal12                 ;// p7p5   = p6p4   + {b,b}
        SADD16  tVal6,  tVal4,   tVal10                 ;// p10p8  = p7p5   + {3b,3b}
        SADD16  tVal7,  tVal6,   tVal12                 ;// p11p9  = p10p8  + {b,b}
        SADD16  tVal8,  tVal7,   tVal10                 ;// p14p12 = p11p9  + {3b,3b}
        SADD16  tVal9,  tVal8,   tVal12                 ;// p15p13 = p14p12 + {b,b}
        LDR     r0x00FF00FF, =MASK_CONST                ;// byte mask; its top 4 bits (zero here) double as the row counter

        ;// Per row: saturate each lane to 0..8191, >> 5 gives 0..255, interleave
        ;// even/odd columns into bytes, store, then add c to every lane for the
        ;// next row.
LOOP_PLANE

        USAT16  temp2, #13, p3p1                        ;// clamp columns 3,1
        USAT16  temp1, #13, p2p0                        ;// clamp columns 2,0
        SADD16  p3p1,  p3p1,  c                         ;// advance to next row
        SADD16  p2p0,  p2p0,  c
        AND     temp2, r0x00FF00FF, temp2, ASR #5       ;// bytes for columns 3,1 (counter bits hit zeros only)
        AND     temp1, r0x00FF00FF, temp1, ASR #5       ;// bytes for columns 2,0
        ORR     temp1, temp1, temp2, LSL #8             ;// word = pixels 3,2,1,0
        STR     temp1, [pDst], #4

        USAT16  temp2, #13, p7p5
        USAT16  temp1, #13, p6p4
        SADD16  p7p5,  p7p5,  c
        SADD16  p6p4,  p6p4,  c
        AND     temp2, r0x00FF00FF, temp2, ASR #5
        AND     temp1, r0x00FF00FF, temp1, ASR #5
        ORR     temp1, temp1, temp2, LSL #8             ;// word = pixels 7,6,5,4
        STR     temp1, [pDst], #4

        USAT16  temp2, #13, p11p9
        USAT16  temp1, #13, p10p8
        SADD16  p11p9, p11p9, c
        SADD16  p10p8, p10p8, c
        AND     temp2, r0x00FF00FF, temp2, ASR #5
        AND     temp1, r0x00FF00FF, temp1, ASR #5
        ORR     temp1, temp1, temp2, LSL #8             ;// word = pixels 11,10,9,8
        STR     temp1, [pDst], #4

        USAT16  temp2, #13, p15p13
        USAT16  temp1, #13, p14p12
        SADD16  p15p13, p15p13, c
        SADD16  p14p12, p14p12, c
        AND     temp2, r0x00FF00FF, temp2, ASR #5
        AND     temp1, r0x00FF00FF, temp1, ASR #5
        ORR     temp1, temp1, temp2, LSL #8             ;// word = pixels 15,14,13,12
        STR     temp1, [pDst], #4

        ADDS    r0x00FF00FF, r0x00FF00FF, #1<<28        ;// row counter in top 4 bits; carry out after 16 rows

        ADD     pDst, pDst, dstStep                     ;// dstStep already reduced by 16: start of next row (flags untouched)

        BCC     LOOP_PLANE                              ;// 16 passes
        MOV     return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// ARM1136JS


        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------