;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS



        IF ARM1136JS

MASK_1  EQU 0x01010101

;// Declare input registers

pQ0     RN 0
StepArg RN 1
tC0Arg  RN 2
alpha   RN 6

beta    RN 14
bS      RN 14
tC0     RN 14
ptC0    RN 1

;// Declare Local/Temporary variables

;// Pixels
p_0     RN 3
p_1     RN 5
p_2     RN 4
p_3     RN 2
q_0     RN 8
q_1     RN 9
q_2     RN 10
q_3     RN 12


;// Filtering

ap0q0   RN 1
filt    RN 2

m00     RN 7
m01     RN 11

apflg   RN 0
aqflg   RN 6

tC      RN 1


;// Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1
P1a     RN 8
Q0a     RN 7
Q1a     RN 4

u1      RN 3
max     RN 12
min     RN 2



;// Declarations for bSGE4 kernel

q_3b    RN 9
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7
P2b     RN 1

Q0b     RN 9
Q1b     RN 0
Q2b     RN 2

;// Miscellaneous

a       RN 0
t0      RN 3
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4
t5      RN 1
t8      RN 6
t9      RN 14
t10     RN 5
t11     RN 9

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Inputs  - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//         - 2 - filt, 0 - apflg, 6 - aqflg
;//         - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels (P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;//
        ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//             = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//             = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3

        USUB8   t1, p_1, p_0
        MUL     tC0, t2, m01

        USUB8   t2, q_1, q_0
        SSUB8   t1, t1, t2

        USUB8   t2, p_0, q_0
        AND     t2, t2, m01
        SHSUB8  t1, t1, t2
        UHSUB8  t5, p_0, q_0
        SSUB8   t1, t1, t2
        SHSUB8  t1, t1, t5
        MOV     m00, #0
        SADD8   t1, t1, m01
        SHSUB8  t1, t1, t5

        ;// tC = tC0
        ;// if (ap < beta) tC++;
        ;// if (aq < beta) tC++;
        USUB8   t5, filt, m01
        SEL     tC0, tC0, m00
        UQADD8  tC, tC0, apflg
        SSUB8   t1, t1, m00
        UQADD8  tC, tC, aqflg

        ;// Split into positive and negative part and clip
        SEL     pos, t1, m00
        USUB8   neg, pos, t1
        USUB8   t3, pos, tC
        SEL     pos, tC, pos
        USUB8   t3, neg, tC
        SEL     neg, tC, neg

        ;// Reload m01
        LDR     m01, =MASK_1

        UQADD8  P0a, p_0, pos
        UQSUB8  Q0a, q_0, pos
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t1, filt, m01
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0
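        ;// For reference, a scalar sketch of the p0/q0 update implemented
        ;// by the SIMD code above: delta is split into its positive and
        ;// negative parts, each part is clipped against tC (the Clip3),
        ;// and the saturating UQADD8/UQSUB8 give the final clamp to
        ;// [0,255]; filt then selects filtered vs original per pixel.
        ;//
        ;//   delta = Clip3(-tC, tC, (((q0-p0)<<2) + (p1-q1) + 4)>>3);
        ;//   P0a   = Clip(0, 255, p0 + delta);
        ;//   Q0a   = Clip(0, 255, q0 - delta);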
        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// u1 = (p0 + q0 + 1)>>1
        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
        MVN     p_0, p_0
        UHSUB8  u1, q_0, p_0
        UQADD8  max, p_1, tC0
        EOR     u1, u1, m01, LSL #7

        ;// Calculate A = (p2+u1)>>1
        ;// Then delta = Clip3( -tC0, tC0, A - p1)

        ;// Clip P1
        UHADD8  P1a, p_2, u1
        UQSUB8  min, p_1, tC0
        USUB8   t4, P1a, max
        SEL     P1a, max, P1a
        USUB8   t4, P1a, min
        SEL     P1a, P1a, min

        ;// Clip Q1
        UHADD8  Q1a, q_2, u1
        UQADD8  max, q_1, tC0
        UQSUB8  min, q_1, tC0
        USUB8   t0, Q1a, max
        SEL     Q1a, max, Q1a
        USUB8   t0, Q1a, min
        SEL     Q1a, Q1a, min

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t0, apflg, m01
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01
        SEL     t3, Q1a, q_1

        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs  - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//         - 2 - filt, 0 - apflg,aqflg
;//         - 1 - ap0q0, 6 - alpha
;//         - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels (P0b,P1b,P2b,Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr

        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)

        M_ARG   pDummy, 4
        M_ARG   pQ_3, 4
        M_ARG   pP_3, 4

        UHADD8  alpha, alpha, m00
        USUB8   t9, p_2, p_0        ;// t9 = dp2p0
        UHADD8  alpha, alpha, m00
        ADD     alpha, alpha, m01, LSL #1
        USUB8   ap0q0, ap0q0, alpha
        SEL     apqflg, m00, apflg

        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)

        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)

        ;// Compute P0b
        USUB8   t2, p_0, q_0
        SSUB8   t5, t9, t2

        USUB8   t8, q_1, q_0
        SHADD8  t8, t5, t8

        USUB8   t9, p_1, p_0
        SADD8   t8, t8, t9
        SHSUB8  t8, t8, t2
        SHADD8  t5, t5, t9
        SHADD8  t8, t8, m01
        SHADD8  t9, t5, m01
        SADD8   P0b, p_0, t8
        ;// P0b ready

        ;// Compute P1b
        M_LDR   p_3b, pP_3
        SADD8   P1b, p_0, t9
        ;// P1b ready

        ;// Compute P2b
        USUB8   t9, p_2, p_0
        SADD8   t5, t5, t9
        UHSUB8  t9, p_3b, p_0
        EOR     a, p_3b, p_0
        AND     a, a, m01
        SHADD8  t5, t5, a
        UHADD8  a, p_0, q_1
        SADD8   t5, t5, m01
        SHADD8  t5, t5, t9
        MVN     t9, p_1
        SADD8   P2b, p_0, t5
        ;// P2b ready

        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9

        EOR     a, a, m01, LSL #7
        SEL     P0b, P0b, a
        SEL     P1b, P1b, p_1
        SEL     P2b, P2b, p_2

        USUB8   t4, filt, m01
        SEL     P0b, P0b, p_0


        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)

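        ;// Selection sketch for the q-side outputs computed below: when
        ;// the aq && |p0-q0| < ((alpha>>2)+2) test (folded into apqflg
        ;// above) fails for a pixel, Q0b falls back to
        ;// (2*q1 + q0 + p1 + 2)>>2 and Q1b/Q2b keep the original q1/q2;
        ;// filt then selects between the filtered and unfiltered q0.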
        ;// Compute Q0b Q1b
        USUB8   t4, q_2, q_0
        USUB8   a, p_0, q_0
        USUB8   t9, p_1, p_0
        SADD8   t0, t4, a
        SHADD8  t9, t0, t9
        UHADD8  t10, q_0, p_1
        SADD8   t9, t9, a
        USUB8   a, q_1, q_0
        SHADD8  t9, t9, a
        SHADD8  t0, t0, a
        SHADD8  t9, t9, m01
        SHADD8  a, t0, m01
        SADD8   t9, q_0, t9
        ;// Q0b ready - t9

        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4

        SADD8   Q1b, q_0, a
        ;// Q1b ready

        USUB8   t4, apqflg, m01
        SEL     Q1b, Q1b, q_1
        MVN     t11, q_1
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3
        EOR     t10, t10, m01, LSL #7
        SEL     t9, t9, t10

        ;// Compute Q2b
        USUB8   t4, q_2, q_0
        SADD8   t4, t0, t4
        EOR     t0, q_3b, q_0
        AND     t0, t0, m01
        SHADD8  t4, t4, t0
        UHSUB8  t10, q_3b, q_0
        SADD8   t4, t4, m01
        SHADD8  t4, t4, t10

        USUB8   t10, filt, m01
        SEL     Q0b, t9, q_0

        SADD8   t4, q_0, t4
        ;// Q2b ready - t4

        USUB8   t10, apqflg, m01
        SEL     Q2b, t4, q_2

        M_END

        ENDIF

        END