; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerHalf function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA |.text|, CODE

;// h264bsdInterpolateVerHalf register allocation
;//
;// Several physical registers are double-aliased below: the second name is
;// only live after the first one is dead (e.g. x0/count both map to r2).

ref     RN 0        ;// arg0: pointer to reference frame data

mb      RN 1        ;// arg1: pointer to output macroblock buffer
buff    RN 1        ;// scratch buffer pointer passed to h264bsdFillBlock

count   RN 2        ;// packed loop counters (see loop_y / loop_x)
x0      RN 2        ;// arg2: horizontal position in reference frame

res     RN 3        ;// assembled 4-pixel output word
y0      RN 3        ;// arg3: vertical position in reference frame

tmp1    RN 4        ;// tmp1..tmp6: six consecutive source rows (rotating roles)

tmp2    RN 5
height  RN 5        ;// stack arg: reference frame height

tmp3    RN 6
partW   RN 6        ;// stack arg: partition width

tmp4    RN 7
partH   RN 7        ;// stack arg: partition height

tmp5    RN 8
tmp6    RN 9

tmpa    RN 10       ;// filter accumulators
tmpb    RN 11
width   RN 12       ;// stack arg: reference frame width (also row stride)

plus16  RN 14       ;// constant 0x00100010 (rounding term +16 per halfword)


;// function exports and imports

    IMPORT h264bsdFillBlock

    EXPORT h264bsdInterpolateVerHalf

;// Approach to vertical interpolation
;//
;// Interpolation is done by using 32-bit loads and stores
;// and by using 16 bit arithmetic. 4x4 block is processed
;// in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...

;-------------------------------------------------------------------------------
;// h264bsdInterpolateVerHalf
;//
;// Vertical half-pixel interpolation: each output pixel is the 6-tap
;// [1,-5,20,20,-5,1] filter applied over six vertically adjacent reference
;// pixels (A,C,G,M,R,T), computed as (16 + 20(G+M) + (A+T) - 5(C+R)) >> 5,
;// saturated to 8 bits. Processes 4x4 pixels per loop_x round using 32-bit
;// loads and 16-bit SIMD arithmetic (UXTB16/UXTAB16/SSUB16/USAT16).
;//
;// Arguments (layout derived from the register aliases and the stack offsets
;// read below; sp+0x218.. is the caller's stack-argument area after the
;// 13-register push plus 0x1e4-byte frame):
;//   r0        = ref        (reference frame pointer)
;//   r1        = mb         (output buffer, written with a 16-byte row stride)
;//   r2        = x0
;//   r3        = y0
;//   [sp+0x218]= width,  [sp+0x21c]= height
;//   [sp+0x220]= partWidth, [sp+0x224]= partHeight
;//
;// If the (partWidth)x(partHeight+5) source area is not fully inside the
;// frame, h264bsdFillBlock first copies/pads it into the on-stack buffer
;// and interpolation reads from there instead.
;-------------------------------------------------------------------------------
h264bsdInterpolateVerHalf
    STMFD   sp!, {r0-r11, lr}           ;// save args + callee-saved regs
    SUB     sp, sp, #0x1e4              ;// frame: fill buffer + spilled args

    ;// Check whether the needed source area lies inside the frame;
    ;// otherwise fall through to do_fill to build a padded copy.
    CMP     x0, #0
    BLT     do_fill                     ;// (x0 < 0)
    LDR     partW, [sp,#0x220]          ;// partWidth
    ADD     tmp5, x0, partW             ;// (x0+partWidth)
    LDR     width, [sp,#0x218]          ;// width
    CMP     tmp5, width
    BHI     do_fill                     ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                     ;// (y0 < 0)
    LDR     partH, [sp,#0x224]          ;// partHeight
    ADD     tmp6, y0, partH             ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5              ;// (y0+partH+5) rows are read
    LDR     height, [sp,#0x21c]         ;// height
    CMP     tmp6, height
    BLS     skip_fill                   ;// no overfill needed


do_fill
    ;// Build h264bsdFillBlock's stack arguments and call it to copy the
    ;// (padded) partWidth x (partHeight+5) area into the on-stack buffer.
    LDR     partH, [sp,#0x224]          ;// partHeight
    ADD     tmp5, partH, #5             ;// r2 = partH + 5;
    LDR     height, [sp,#0x21c]         ;// height
    LDR     partW, [sp,#0x220]          ;// partWidth
    STMIB   sp, {height, partW}         ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]             ;// sp+c = partHeight+5
    STR     partW, [sp,#0x10]           ;// sp+10 = partWidth
    LDR     width, [sp,#0x218]          ;// width
    STR     width, [sp,#0]              ;// sp+0 = width
    ADD     buff, sp, #0x28             ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    ;// After filling, interpolate from the buffer at offset (0,0) with the
    ;// buffer's own row stride (partWidth).
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]              ;// x0 = 0
    STR     x0,[sp,#0x1f0]              ;// y0 = 0
    ADD     ref,sp,#0x28                ;// ref = p1
    STR     partW, [sp,#0x218]          ;// width = partWidth for the buffer


skip_fill
    LDR     x0 ,[sp,#0x1ec]             ;// x0
    LDR     y0 ,[sp,#0x1f0]             ;// y0
    LDR     width, [sp,#0x218]          ;// width
    MLA     tmp6, width, y0, x0         ;// y0*width+x0
    ADD     ref, ref, tmp6              ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]            ;// mb (saved r1)

    ;// Pack both loop counters into one register:
    ;// high halfword = partHeight-1, low halfword = partWidth-1.
    ;// loop_x additionally keeps a copy of partWidth-1 in the top byte.
    ADD     count, partW, partH, LSL #16 ;// |partH|partW|
    LDR     tmp5, = 0x00010001
    SSUB16  count, count, tmp5          ;// |partH-1|partW-1|
    LDR     plus16, = 0x00100010        ;// +16 rounding term, both halfwords

    AND     tmp1, count, #0x000000FF    ;// partWidth-1


loop_y
    ADD     count, count, tmp1, LSL #24 ;// partWidth-1 to top byte

loop_x
    ;// Load 6 source rows; each LDR post-increments ref by one row.
    LDR     tmp1, [ref], width          ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width          ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width          ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width          ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width          ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width          ;// |t4|t3|t2|t1|

    ;// first four pixels (output row 0)
    ;// Even bytes (1,3) are filtered in tmpa first, then odd bytes (2,4)
    ;// via ROR #8; USAT16 clamps, shifts merge the two halves into res.
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [ref], width          ;// load next row early (row t for round 2)
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa              ;// combine even and odd pixels
    STR     res, [mb], #16              ;// next row (mb stride is 16 bytes)

    ;// Row registers have rotated one step:
    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|

    ;// second four pixels (output row 1) -- same filter, rotated registers
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [ref], width          ;// load next row early (row t for round 3)
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels (output row 2)
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [ref]                 ;// last row needed: no post-increment
    LDR     tmpa, = 0xFF00FF00

    ;// decrement loop_x counter (top byte); carry feeds BCS below
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels (output row 3)
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)
    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #-44             ;// 16*3-44 = 4: back to row 0, next column

    BCS     loop_x                      ;// more columns left (carry from SUBS above)

    ;// Advance to the next 4-row band; flags from ADDS drive BGE below.
    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x000000FF    ;// partWidth-1
    ADD     tmp2, tmp1, #1              ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64                 ;// mb += 4 output rows
    SUB     mb, mb, tmp2                ;// mb -= partWidth
    BGE     loop_y

    ;// Release frame + the four arg-register slots, restore and return.
    ADD     sp,sp,#0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END