;//
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;// H.264 (MPEG-4 Part 10) luma deblocking filter inner kernels for
;// ARM1136JS, built on the ARMv6 packed-byte SIMD instructions
;// (USUB8/SADD8/SEL etc.), processing 4 pixels per 32-bit register.
;// RVCT (armasm) syntax; ';' introduces a comment.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS



        IF ARM1136JS

;// 0x01010101 replicates a byte value across all 4 SIMD lanes.
MASK_1  EQU 0x01010101

;// Declare input registers

pQ0     RN 0
StepArg RN 1
tC0Arg  RN 2
alpha   RN 6

;// NOTE(review): beta, bS and tC0 all alias r14 (lr) - they are live at
;// different times, never simultaneously.
beta    RN 14
bS      RN 14
tC0     RN 14
ptC0    RN 1

;// Declare Local/Temporary variables

;// Pixels (each register holds 4 horizontally adjacent pixels)

p_0     RN 3
p_1     RN 5
p_2     RN 4
p_3     RN 2
q_0     RN 8
q_1     RN 9
q_2     RN 10
q_3     RN 12


;// Filtering

ap0q0   RN 1
filt    RN 2

m00     RN 7                    ;// constant 0x00000000
m01     RN 11                   ;// constant 0x01010101 (MASK_1)

apflg   RN 0
aqflg   RN 6

tC      RN 1


;//Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1
P1a     RN 8
Q0a     RN 7
Q1a     RN 4

u1      RN 3
max     RN 12
min     RN 2



;//Declarations for bSGE4 kernel

q_3b    RN 9
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7
P2b     RN 1

Q0b     RN 9
Q1b     RN 0
Q2b     RN 2

;// Miscellaneous scratch registers (alias the names above)

a       RN 0
t0      RN 3
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4
t5      RN 1
t8      RN 6
t9      RN 14
t10     RN 5
t11     RN 9

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg, 6 - aqflg
;//        - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14
;//
;// Weak-filter (bS < 4) kernel: computes the clipped delta and the
;// filtered p0/q0/p1/q1 values for 4 pixel positions at once.


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

    ;// Since beta <= 18 and alpha <= 255 we know
    ;// -254 <= p0-q0 <= 254
    ;//  -17 <= q1-q0 <= 17
    ;//  -17 <= p1-p0 <= 17

    ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
    ;//
    ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
    ;//             = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
    ;//             = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3

        USUB8   t1, p_1, p_0
        MUL     tC0, t2, m01        ;// replicate tC0 (byte input in r7) into all 4 lanes

        USUB8   t2, q_1, q_0
        SSUB8   t1, t1, t2

        USUB8   t2, p_0, q_0
        AND     t2, t2, m01
        SHSUB8  t1, t1, t2
        UHSUB8  t5, p_0, q_0
        SSUB8   t1, t1, t2
        SHSUB8  t1, t1, t5
        MOV     m00, #0
        SADD8   t1, t1, m01
        SHSUB8  t1, t1, t5          ;// t1 = delta (per byte, before clipping)

    ;// tC = tC0
    ;// if (ap < beta) tC++;
    ;// if (aq < beta) tC++;
        USUB8   t5, filt, m01       ;// GE[i] set where filt byte >= 1 (filtering enabled)
        SEL     tC0, tC0, m00       ;// zero tC0 in lanes where filtering is disabled
        UQADD8  tC, tC0, apflg      ;// apflg/aqflg bytes presumably 0 or 1 - TODO confirm in caller
        SSUB8   t1, t1, m00         ;// subtract 0: only sets per-lane GE on sign of delta
        UQADD8  tC, tC, aqflg

    ;// Split delta into positive and negative parts and clip each to tC
        SEL     pos, t1, m00        ;// pos = max(delta, 0) per lane
        USUB8   neg, pos, t1        ;// neg = -min(delta, 0) per lane
        USUB8   t3, pos, tC
        SEL     pos, tC, pos        ;// pos = min(pos, tC)
        USUB8   t3, neg, tC
        SEL     neg, tC, neg        ;// neg = min(neg, tC)

    ;//Reload m01 (r11 was clobbered above as a scratch alias)
        LDR     m01,=MASK_1

        UQADD8  P0a, p_0, pos       ;// p0' = clip(p0 + delta)
        UQSUB8  Q0a, q_0, pos       ;// q0' = clip(q0 - delta)
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg

    ;// Choose to store the filtered
    ;// value or the original pixel
        USUB8   t1, filt, m01
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0

    ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
    ;//    u1 = (p0 + q0 + 1)>>1
    ;//    u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
    ;// (one's-complement trick: ~p0 turns the rounding add into a subtract)
        MVN     p_0, p_0
        UHSUB8  u1, q_0, p_0
        UQADD8  max, p_1, tC0
        EOR     u1, u1, m01 ,LSL #7 ;// 0x80808080: undo the complement bias

    ;// Calculate A = (p2+u1)>>1
    ;// Then delta = Clip3( -tC0, tC0, A - p1)

    ;// Clip P1 into [p1-tC0, p1+tC0]
        UHADD8  P1a, p_2, u1
        UQSUB8  min, p_1, tC0
        USUB8   t4, P1a, max
        SEL     P1a, max, P1a
        USUB8   t4, P1a, min
        SEL     P1a, P1a, min

    ;// Clip Q1 into [q1-tC0, q1+tC0]
        UHADD8  Q1a, q_2, u1
        UQADD8  max, q_1, tC0
        UQSUB8  min, q_1, tC0
        USUB8   t0, Q1a, max
        SEL     Q1a, max, Q1a
        USUB8   t0, Q1a, min
        SEL     Q1a, Q1a, min

    ;// Choose to store the filtered
    ;// value or the original pixel
        USUB8   t0, apflg, m01
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01
        SEL     t3, Q1a, q_1        ;// final Q1 result is returned in r11 (t3), per header

        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg,aqflg
;//        - 1 - ap0q0, 6 - alpha
;//        - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14
;//
;// Strong-filter (bS == 4) kernel: computes the 3-pixel-deep filtered
;// values on each side of the edge for 4 pixel positions at once.
;// p3/q3 are fetched from the stack via M_LDR (M_ARG offsets below).

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr

    ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
    ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)

        M_ARG   pDummy,4
        M_ARG   pQ_3,4
        M_ARG   pP_3,4

        UHADD8  alpha, alpha, m00       ;// alpha >>= 1 per byte (m00 == 0)
        USUB8   t9, p_2, p_0            ;//t9 = dp2p0
        UHADD8  alpha, alpha, m00       ;// now alpha>>2
        ADD     alpha, alpha, m01, LSL #1   ;// (alpha>>2) + 2 in each byte
        USUB8   ap0q0, ap0q0, alpha     ;// GE[i] set where |p0-q0| >= (alpha>>2)+2
        SEL     apqflg, m00, apflg      ;// combined flag, cleared where the test fails

    ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
    ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
    ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

    ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
    ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)

    ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
    ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
    ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)

    ;// Compute P0b
        USUB8   t2, p_0, q_0
        SSUB8   t5, t9, t2

        USUB8   t8, q_1, q_0
        SHADD8  t8, t5, t8

        USUB8   t9, p_1, p_0
        SADD8   t8, t8, t9
        SHSUB8  t8, t8, t2
        SHADD8  t5, t5, t9
        SHADD8  t8, t8, m01             ;// +4 and >>3 folded into the halving adds
        SHADD8  t9, t5, m01
        SADD8   P0b, p_0, t8
    ;// P0b ready

    ;// Compute P1b
        M_LDR   p_3b, pP_3              ;// fetch p3 pointer argument from stack
        SADD8   P1b, p_0, t9
    ;// P1b ready

    ;// Compute P2b
        USUB8   t9, p_2, p_0
        SADD8   t5, t5, t9
        UHSUB8  t9, p_3b, p_0
        EOR     a, p_3b, p_0            ;// low-bit correction term for the halved sum
        AND     a, a, m01
        SHADD8  t5, t5, a
        UHADD8  a, p_0, q_1             ;// start of the weak fallback p0' = (2*p1+p0+q1+2)>>2
        SADD8   t5, t5, m01
        SHADD8  t5, t5, t9
        MVN     t9, p_1                 ;// ~p1: one's-complement trick for the rounding add
        SADD8   P2b, p_0, t5
    ;// P2b ready

        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9          ;// GE flags from apqflg: strong vs fallback select

        EOR     a, a, m01, LSL #7       ;// 0x80808080: undo complement bias in fallback p0'
        SEL     P0b, P0b, a             ;// strong P0 where apqflg set, else fallback
        SEL     P1b, P1b, p_1           ;// P1/P2 only filtered where apqflg set
        SEL     P2b, P2b, p_2

        USUB8   t4, filt, m01
        SEL     P0b, P0b, p_0           ;// keep original p0 where filtering disabled


    ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
    ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
    ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

    ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
    ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

    ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
    ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
    ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)


    ;// Compute Q0b Q1b (mirror of the P side)
        USUB8   t4, q_2, q_0
        USUB8   a, p_0, q_0
        USUB8   t9, p_1, p_0
        SADD8   t0, t4, a
        SHADD8  t9, t0, t9
        UHADD8  t10, q_0, p_1           ;// start of the weak fallback q0' = (2*q1+q0+p1+2)>>2
        SADD8   t9, t9, a
        USUB8   a, q_1, q_0
        SHADD8  t9, t9, a
        SHADD8  t0, t0, a
        SHADD8  t9, t9, m01
        SHADD8  a, t0, m01
        SADD8   t9, q_0, t9
    ;// Q0b ready - t9

        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4      ;// apqflg >>= 1 per byte; aqflg half now in low bits

        SADD8   Q1b, q_0, a
    ;// Q1b ready

        USUB8   t4, apqflg, m01
        SEL     Q1b, Q1b, q_1           ;// keep original q1 where aq test failed
        MVN     t11, q_1                ;// ~q1 for the fallback rounding add
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3              ;// fetch q3 pointer argument from stack
        EOR     t10, t10, m01, LSL #7   ;// undo complement bias in fallback q0'
        SEL     t9, t9, t10             ;// strong Q0 where flag set, else fallback

    ;// Compute Q2b
        USUB8   t4, q_2, q_0
        SADD8   t4, t0, t4
        EOR     t0, q_3b, q_0           ;// low-bit correction term for the halved sum
        AND     t0, t0, m01
        SHADD8  t4, t4, t0
        UHSUB8  t10, q_3b, q_0
        SADD8   t4, t4, m01
        SHADD8  t4, t4, t10

        USUB8   t10, filt, m01
        SEL     Q0b, t9, q_0            ;// keep original q0 where filtering disabled

        SADD8   t4, q_0, t4
    ;// Q2b ready - t4

        USUB8   t10, apqflg, m01
        SEL     Q2b, t4, q_2            ;// keep original q2 where aq test failed

        M_END

        ENDIF

        END