;// armVCM4P10_DeblockingLuma_unsafe_s.s revision 78e52bfac041d71ce53b5b13c2abf78af742b09d
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8


        IF CortexA8

pThresholds RN 5

;// Pixels
dP_0        DN D4.U8
dP_1        DN D5.U8
dP_2        DN D6.U8
dP_3        DN D7.U8
dQ_0        DN D8.U8
dQ_1        DN D9.U8
dQ_2        DN D10.U8
dQ_3        DN D11.U8


;// Filtering Decision
dAlpha      DN D0.U8

dFilt       DN D16.U8
dAqflg      DN D12.U8
dApflg      DN D17.U8

dAp0q0      DN D13.U8

;// bSLT4
dTC0        DN D18.U8
dTC1        DN D19.U8
dTC01       DN D18.U8

dTCs        DN D31.S8
dTC         DN D31.U8

dMask_0     DN D14.U8
dMask_1     DN D15.U8

dTemp       DN D19.U8

;// Computing P0,Q0
qDq0p0      QN Q10.S16
qDp1q1      QN Q11.S16
qDelta      QN Q10.S16   ; reuse qDq0p0
dDelta      DN D20.S8


;// Computing P1,Q1
dRp0q0      DN D24.U8

dMaxP       DN D23.U8
dMinP       DN D22.U8

dMaxQ       DN D19.U8
dMinQ       DN D21.U8

dDeltaP     DN D26.U8
dDeltaQ     DN D27.U8

qP_0n       QN Q14.S16
qQ_0n       QN Q12.S16

dQ_0n       DN D24.U8
dQ_1n       DN D25.U8
dP_0n       DN D29.U8
dP_1n       DN D30.U8

;// bSGE4

qSp0q0      QN Q10.U16

qSp2q1      QN Q11.U16
qSp0q0p1    QN Q12.U16
qSp3p2      QN Q13.U16
dHSp0q1     DN D28.U8

qSq2p1      QN Q11.U16
qSp0q0q1    QN Q12.U16
qSq3q2      QN Q13.U16   ;!!
dHSq0p1     DN D28.U8    ;!!

qTemp1      QN Q11.U16   ;!!;qSp2q1
qTemp2      QN Q12.U16   ;!!;qSp0q0p1

dP_0t       DN D28.U8    ;!!;dHSp0q1
dQ_0t       DN D22.U8    ;!!;Temp1

dP_0n       DN D29.U8
dP_1n       DN D30.U8
dP_2n       DN D31.U8

dQ_0n       DN D24.U8    ;!!;Temp2
dQ_1n       DN D25.U8    ;!!;Temp2
dQ_2n       DN D28.U8    ;!!;dQ_0t

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe
;//
;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
;//        - Additional Params  - pThresholds: r5
;//
;// Outputs - Pixels            - P0-P1: D29-D30, Q0-Q1: D24-D25
;//         - Additional Params - pThresholds: r5

;// Registers Corrupted - D18-D31



        M_START armVCM4P10_DeblockingLumabSLT4_unsafe


        ;// qDq0p0-10
        VSUBL   qDp1q1, dP_1, dQ_1
        VLD1    {dTC0[]}, [pThresholds]!
        ;// qDp1q1-11
        VSUBL   qDq0p0, dQ_0, dP_0
        VLD1    {dTC1[]}, [pThresholds]!

        ;// dRp0q0-24
        VSHR    qDp1q1, qDp1q1, #2

        ;// dTC01 = (dTC1 << 4) | dTC0
        ;// dTC01-18
        VEXT    dTC01, dTC0, dTC1, #4
        ;// dTemp-19
        VAND    dTemp, dApflg, dMask_1

        VBIF    dTC01, dMask_0, dFilt


        ;// delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3;
        ;// dDelta = (qDp1q1 >> 2 + qDq0p0 + 1)>> 1

        ;// qDelta-qDq0p0-10
        VRHADD  qDelta, qDp1q1, qDq0p0
        VRHADD  dRp0q0, dP_0, dQ_0
        VADD    dTC, dTC01, dTemp

        ;// dTC = dTC01 + (dAplg & 1) + (dAqflg & 1)

        VAND    dTemp, dAqflg, dMask_1
        VQADD   dMaxP, dP_1, dTC01
        VQMOVN  dDelta, qDelta
        VADD    dTC, dTC, dTemp

        ;// dMaxP = QADD(dP_1, dTC01)
        ;// dMinP = QSUB(dP_1, dTC01)

        ;// dMaxP-d23
        ;// dMinP-d22
        VQSUB   dMinP, dP_1, dTC01

        ;// dDelta-d20

        ;// dMaxQ = QADD(dQ_1, dTC01)
        ;// dMinQ = QSUB(dQ_1, dTC01)

        ;// dMaxQ-19
        ;// dMinQ-21
        VQADD   dMaxQ, dQ_1, dTC01
        VHADD   dDeltaP, dRp0q0, dP_2
        VMIN    dDelta, dDelta, dTCs

        ;// dDelta = (OMX_U8)armClip(0, 255, q0 - delta);
        VNEG    dTCs, dTCs

        VQSUB   dMinQ, dQ_1, dTC01


        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// delta = armClip(-tC0, tC0, delta);
        ;// pQ0[-2*Step] = (OMX_U8)(p1 + delta);

        ;// dDeltaP = (dP_2 + dRp0q0)>>1;
        ;// dP_1n = armClip(dP_1 - dTC01, dP_1 + dTC01, dDeltaP);
        ;// dP_1n = armClip(MinP, MaxP, dDeltaP);

        ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
        ;// delta = armClip(-tC0, tC0, delta);
        ;// pQ0[1*Step] = (OMX_U8)(q1 + delta);

        ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;
        ;// dQ_1n = armClip(dQ_1 - dTC01, dQ_1 + dTC01, dDeltaQ);
        ;// dQ_1n = armClip(MinQ, MaxQ, dDeltaQ);

        ;// dDeltaP-26
        VHADD   dDeltaQ, dRp0q0, dQ_2

        ;// dDeltaQ-27

        ;// dP_0n - 29
        ;// dP_1n - 30
        ;// dQ_0n - 24
        ;// dQ_1n - 25

        ;// delta = (q2 + ((p0+q0+1)>>1) - (q1<<1))>>1;
        ;// dDeltaQ = (dQ_2 + dRp0q0)>>1;

        VMAX    dP_1n, dDeltaP, dMinP
        VMAX    dDelta, dDelta, dTCs

        ;// pQ0[-1*Step] = (OMX_U8)armClip(0, 255, dP_0 - delta);
        ;// pQ0[0*Step] = (OMX_U8)armClip(0, 255, dQ_0 - delta);

        ;// dP_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);
        ;// dQ_0n = (OMX_U8)armClip(0, 255, dP_0 - dDelta);

        ;// qP_0n - 14
        ;// qQ_0n - 12

        VMOVL   qP_0n, dP_0
        VMOVL   qQ_0n, dQ_0

        VADDW   qP_0n, qP_0n, dDelta
        VSUBW   qQ_0n, qQ_0n, dDelta

        VQMOVUN dP_0n, qP_0n
        VQMOVUN dQ_0n, qQ_0n

        VMAX    dQ_1n, dDeltaQ, dMinQ

        VMIN    dP_1n, dP_1n, dMaxP
        VMIN    dQ_1n, dQ_1n, dMaxQ
        VBIF    dP_0n, dP_0, dFilt

        VBIF    dP_1n, dP_1, dApflg
        VBIF    dQ_0n, dQ_0, dFilt
        VBIF    dQ_1n, dQ_1, dAqflg

        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs - Pixels             - p0-p3: D4-D7, q0-q3: D8-D11
;//        - Filter masks       - filt: D16, aqflg: D12, apflg: D17
;//        - Additional Params  - alpha: D0, dMask_1: D15
;//
;// Outputs - Pixels            - P0-P2: D29-D31, Q0-Q2: D24,D25,D28

;// Registers Corrupted - D18-D31

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe


        ;// ap<beta && armAbs(p0-q0)<((alpha>>2)+2)
        ;// aq<beta && armAbs(p0-q0)<((alpha>>2)+2)

        ;// ( dApflg & dAp0q0 < (dAlpha >> 2 + 2) )
        ;// ( dAqflg & dAp0q0 < (dAlpha >> 2 + 2) )

        ;// ( dApflg = dApflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )
        ;// ( dAqflg = dAqflg & dAp0q0 < (dTemp + dMask_1 + dMask_1) )

        ;// P Filter

        VSHR    dTemp, dAlpha, #2
        VADD    dTemp, dTemp, dMask_1

        ;// qSp0q0-10
        VADDL   qSp0q0, dQ_0, dP_0
        VADD    dTemp, dTemp, dMask_1

        ;// qSp2q1-11
        ;// qSp0q0p1-12
        VADDL   qSp2q1, dP_2, dQ_1
        VADDW   qSp0q0p1, qSp0q0, dP_1

        VCGT    dTemp, dTemp, dAp0q0
        VSHR    qSp2q1, #1

        ;// pQ0[-1*Step] = (OMX_U8)((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3);
        ;// pQ0[-1*Step] = ( ( (p0 + q0 + p1) + (p2 + q1)>>1 ) >> 1 + 1 ) >> 1

        ;// dP_0n = ( ( (qSp0q0 + dP_1) + qSp2q1>>1 ) >> 1 + 1 ) >> 1
        ;// dP_0n = ( ( qSp0q0p1 + qSp2q1>>1 ) >> 1 + 1 ) >> 1
        ;// dP_0n = ( qTemp1 + 1 ) >> 1

        ;// pQ0[-2*Step] = (OMX_U8)((p2 + p1 + p0 + q0 + 2)>>2);

        ;// dP_1n = (OMX_U8)((dP_2 + qSp0q0p1 + 2)>>2);
        ;// dP_1n = (OMX_U8)((qTemp2 + 2)>>2);

        ;// pQ0[-3*Step] = (OMX_U8)((2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3);
        ;// pQ0[-3*Step] = (OMX_U8)(( (p3 + p2) + (p1 + p0 + q0 + p2) >> 1 + 2)>>2);

        ;// dP_2n = (OMX_U8)(( qSp3p2 + (dP_2 + qSp0q0p1) >> 1 + 2) >> 2);
        ;// dP_2n = (OMX_U8)(( qSp3p2 + qTemp2 >> 1 + 2) >> 2);

        ;// qTemp1-qSp2q1-11
        ;// qTemp2-qSp0q0p1-12
        VHADD   qTemp1, qSp0q0p1, qSp2q1
        VADDW   qTemp2, qSp0q0p1, dP_2

        ;// qSp3p2-13
        VADDL   qSp3p2, dP_3, dP_2

        VAND    dApflg, dApflg, dTemp
        VHADD   dHSp0q1, dP_0, dQ_1
        VSRA    qSp3p2, qTemp2, #1
        ;// dHSp0q1-28
        VAND    dAqflg, dAqflg, dTemp

        ;// dP_0n-29
        ;// dP_0t-dHSp0q1-28
        VQRSHRN dP_0n, qTemp1, #1
        VRHADD  dP_0t, dHSp0q1, dP_1

        ;// dP_1n-30
        VQRSHRN dP_1n, qTemp2, #2

        VADDL   qSq2p1, dQ_2, dP_1
        VADDW   qSp0q0q1, qSp0q0, dQ_1

        VBIF    dP_0n, dP_0t, dApflg

        ;// Q Filter

        ;// pQ0[0*Step] = (OMX_U8)((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3);
        ;// pQ0[0*Step] = ( ( (p0 + q0 + q1) + (q2 + p1)>>1 ) >> 1 + 1 ) >> 1

        ;// dQ_0n = ( ( (qSp0q0 + dQ_1) + qSq2p1>>1 ) >> 1 + 1 ) >> 1
        ;// dQ_0n = ( ( qSp0q0q1 + qSq2p1>>1 ) >> 1 + 1 ) >> 1
        ;// dQ_0n = ( qTemp1 + 1 ) >> 1

        ;// pQ0[1*Step] = (OMX_U8)((q2 + q1 + q0 + q0 + 2)>>2);

        ;// dQ_1n = (OMX_U8)((dQ_2 + qSp0q0q1 + 2)>>2);
        ;// dQ_1n = (OMX_U8)((qTemp2 + 2)>>2);

        ;// pQ0[2*Step] = (OMX_U8)((2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3);
        ;// pQ0[2*Step] = (OMX_U8)(( (q3 + q2) + (q1 + p0 + q0 + q2) >> 1 + 2)>>2);

        ;// dQ_2n = (OMX_U8)(( qSq3q2 + (dQ_2 + qSp0q0q1) >> 1 + 2) >> 2);
        ;// dQ_2n = (OMX_U8)(( qSq3q2 + qTemp2 >> 1 + 2) >> 2);

        ;// qTemp1-qSp2q1-11
        ;// qTemp2-qSp0q0p1-12
        ;// qSq2p1-11
        ;// qSp0q0q1-12


        ;// qTemp2-qSp0q0p1-12
        ;// qTemp1-qSq2p1-11
        ;// qSq3q2-13
        ;// dP_2n-31

        VQRSHRN dP_2n, qSp3p2, #2
        VADDL   qSq3q2, dQ_3, dQ_2

        VSHR    qSq2p1, #1

        VHADD   qTemp1, qSp0q0q1, qSq2p1
        VADDW   qTemp2, qSp0q0q1, dQ_2

        ;// dHSq0p1-28
        VHADD   dHSq0p1, dQ_0, dP_1

        VBIF    dP_0n, dP_0, dFilt
        VBIF    dP_1n, dP_1, dApflg

        VSRA    qSq3q2, qTemp2, #1

        ;// dQ_1-Temp2-25
        ;// dQ_0-Temp2-24
        VQRSHRN dQ_1n, qTemp2, #2
        VQRSHRN dQ_0n, qTemp1, #1

        ;// dQ_0t-Temp1-22
        VRHADD  dQ_0t, dHSq0p1, dQ_1
        VBIF    dQ_1n, dQ_1, dAqflg

        VBIF    dP_2n, dP_2, dApflg
        VBIF    dQ_0n, dQ_0t, dAqflg
        VQRSHRN dQ_2n, qSq3q2, #2
        VBIF    dQ_0n, dQ_0, dFilt
        VBIF    dQ_2n, dQ_2, dAqflg

        M_END

        ENDIF


        END