; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
;--
;-------------------------------------------------------------------------------

    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateVerQuarter register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

res     RN 3
y0      RN 3

tmp1    RN 4

tmp2    RN 5
height  RN 5

tmp3    RN 6
partW   RN 6

tmp4    RN 7
partH   RN 7

tmp5    RN 8
tmp6    RN 9

tmpa    RN 10
tmpb    RN 11
width   RN 12

plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateVerQuarter

;// Approach to vertical interpolation
;//
;// Interpolation is done by using 32-bit loads and stores
;// and by using 16-bit arithmetic. A 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...
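;//
;// Each column is filtered vertically with the 6-tap H.264 half-pel
;// kernel and the result is then averaged with the nearest full-pel
;// row (G or M, selected by verOffset). As an illustrative sketch of
;// what the loop below computes per pixel (pseudocode only; A,C,G,M,R,T
;// follow the row naming used in the comments below):
;//
;//   h   = A - 5*C + 20*G + 20*M - 5*R + T;       // 6-tap filter
;//   h   = clip(0, 255, (h + 16) >> 5);           // round and clip
;//   out = (h + (verOffset ? M : G) + 1) >> 1;    // quarter-pel average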

h264bsdInterpolateVerQuarter
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp5, x0, partW         ;// (x0+partWidth)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp5, width
    BHI     do_fill                 ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp6, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// r2 = partH + 5;
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+c = partHeight+5
    STR     partW, [sp,#0x10]       ;// sp+10 = partWidth
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0, [sp,#0x1ec]         ;// x0 = 0
    STR     x0, [sp,#0x1f0]         ;// y0 = 0
    ADD     ref, sp, #0x28          ;// ref = p1
    STR     partW, [sp,#0x218]


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp6, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]        ;// mb

    ADD     count, partW, partH, LSL #8    ;// |xx|xx|partH|partW|
    LDR     tmp5, = 0x00010100
    RSB     count, tmp5, count, LSL #8     ;// |xx|partH-1|partW-1|xx|
    LDR     tmp2, [sp, #0x228]             ;// verOffset
    ADD     count, count, tmp2             ;// |xx|partH-1|partW-1|verOffset|
    LDR     plus16, = 0x00100010

    AND     tmp1, count, #0x0000FF00       ;// partWidth-1


loop_y
    ADD     count, count, tmp1, LSL #16    ;// partWidth-1 to top byte

loop_x
    LDR     tmp1, [ref], width      ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width      ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width      ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width      ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width      ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width      ;// |t4|t3|t2|t1|

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    MOVS    tmp1, count, LSL #31        ;// update flags (verOffset)
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp1, tmp3                  ;// select verOffset=0
    MVNNE   tmp1, tmp4                  ;// select verOffset=1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa

    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp1              ;// bilinear interpolation
    LDR     tmp1, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign

    STR     res, [mb], #16              ;// next row (mb)
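
    ;// Note: the averaging above relies on tmp1 holding the bitwise
    ;// complement of the selected full-pel row. Per byte, UHSUB8 gives
    ;// (h - (255-g)) >> 1 = ((h+g+1) >> 1) - 128, and the EOR with
    ;// 0x80808080 adds the 128 back, so res holds the rounded average
    ;// (h+g+1) >> 1 without ever unpacking the bytes.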

    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp2, tmp4                  ;// select verOffset=0
    MVNNE   tmp2, tmp5                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp2              ;// bilinear interpolation
    LDR     tmp2, [ref], width          ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmpa, = 0xFF00FF00
    MVNEQ   tmp3, tmp5                  ;// select verOffset=0
    MVNNE   tmp3, tmp6                  ;// select verOffset=1

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp3              ;// bilinear interpolation
    LDR     tmp3, [ref]                 ;// load next row
    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #16              ;// next row
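
    ;// Note: only the first group of four pixels loads all six source
    ;// rows; each later group loads a single new row and the six rows
    ;// rotate through tmp1-tmp6, as the register maps above and below
    ;// show.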

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00
    MVNEQ   tmp5, tmp6                  ;// select verOffset=0
    MVNNE   tmp5, tmp1                  ;// select verOffset=1

    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    LDR     tmpa, = 0x80808080
    UHSUB8  res, res, tmp5              ;// bilinear interpolation

    ;// decrement loop_x counter
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)

    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44

    BCS     loop_x

    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x0000FF00    ;// partWidth-1
    MOV     tmp2, #1
    ADD     tmp2, tmp2, tmp1, LSR #8    ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64
    SUB     mb, mb, tmp2                ;// mb -= partWidth
    BGE     loop_y

    ADD     sp, sp, #0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END