; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
;--            function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHorVer register allocation
;//
;// Several names share one physical register; the second name of each pair
;// only becomes live after the first one is dead.

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valY    RN 3

width   RN 4

tmp4    RN 5
height  RN 5

tmp1    RN 6

tmp2    RN 7

tmp3    RN 8

valX    RN 9

tmp5    RN 10
chrPW   RN 10

tmp6    RN 11
chrPH   RN 11

xFrac   RN 12

c32     RN 14
yFrac   RN 14

;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHorVer

;// Function arguments (offsets are relative to sp after the prologue)
;//
;//  u8 *ref,                   : 0xc4
;//  u8 *predPartChroma,        : 0xc8
;//  i32 x0,                    : 0xcc
;//  i32 y0,                    : 0xd0
;//  u32 width,                 : 0xf8
;//  u32 height,                : 0xfc
;//  u32 xFrac,                 : 0x100
;//  u32 yFrac,                 : 0x104
;//  u32 chromaPartWidth,       : 0x108
;//  u32 chromaPartHeight       : 0x10c
;//
;// Stack frame after the prologue (13 registers pushed + 0xc4 bytes of locals)
;//
;//  0x000-0x013 : five outgoing stack arguments for h264bsdFillBlock
;//  0x01c-0x0c3 : scratch block that h264bsdFillBlock writes into
;//  0x0c4-0x0d3 : saved r0-r3 (ref, predPartChroma, x0, y0)
;//  0x0d4-0x0f7 : saved r4-r11, lr
;//  0x0f8-0x10f : stack arguments of the caller (width .. chromaPartHeight)
;//
;// Operation
;//
;//  Each output pixel is a bilinear blend of a 2x2 neighbourhood A B / C D:
;//
;//    out = ( (8-xFrac) * ((8-yFrac)*A + yFrac*C)
;//          +    xFrac  * ((8-yFrac)*B + yFrac*D) + 32 ) >> 6
;//
;//  Two planes are produced. The first is read at ref and written at
;//  predPartChroma, the second is read at ref + width*height and written at
;//  predPartChroma + 64. Output rows are 8 bytes apart.
;//
;//  If the (chromaPartWidth+1) x (chromaPartHeight+1) source window does not
;//  lie inside the width x height plane, both planes are first fetched into
;//  the scratch block through h264bsdFillBlock and interpolation reads from
;//  there instead.
;//
;//  NOTE(review): the 4-bit packed loop counters, the 2x2 pixel step and the
;//  0xa8-byte scratch block assume chromaPartWidth and chromaPartHeight are
;//  even and at most 8 - confirm against the callers.

h264bsdInterpolateChromaHorVer
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #0xc4

    ;// xFrac is not loaded here: r12 is not needed before skip_fill and
    ;// would not survive the calls to h264bsdFillBlock anyway.
    LDR     chrPW, [sp, #0x108]     ;// chromaPartWidth
    LDR     width, [sp, #0xf8]      ;// width
    CMP     x0, #0
    BLT     do_fill

    ADD     tmp1, x0, chrPW         ;// tmp1 = x0+ chromaPartWidth
    ADD     tmp1, tmp1, #1          ;// tmp1 = x0+ chromaPartWidth+1
    CMP     tmp1, width             ;// x0+chromaPartWidth+1 > width
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill
    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp1, y0, chrPH         ;// tmp1 = y0 + chromaPartHeight
    ADD     tmp1, tmp1, #1          ;// tmp1 = y0 + chromaPartHeight + 1
    CMP     tmp1, height
    BLS     skip_fill               ;// window fully inside: read ref directly

do_fill
    ;// First plane: r0 = ref, r1 = scratch block, r2 = x0, r3 = y0 and five
    ;// stack words {width, height, chromaPartWidth+1, chromaPartHeight+1,
    ;// chromaPartWidth+1}.
    ;// NOTE(review): the last three are presumably blockWidth, blockHeight
    ;// and fillScanLength - confirm against the h264bsdFillBlock prototype.
    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp3, chrPW, #1         ;// tmp3 = chromaPartWidth+1
    ADD     tmp1, chrPW, #1         ;// tmp1 = chromaPartWidth+1
    ADD     tmp2, chrPH, #1         ;// tmp2 = chromaPartHeight+1
    STMIA   sp,{width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock

    ;// Second plane: r0-r3 are reloaded and the stack arguments rewritten
    ;// before the second call.
    LDR     x0, [sp, #0xcc]
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp,{width,height,tmp1,tmp2,tmp3}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width * height;
    MLA     block, tmp2, tmp1, block;// block + (chromaPW+1)*(chromaPH+1)
    BL      h264bsdFillBlock

    ;// From here on the scratch block is the reference picture: origin
    ;// (0,0), width = chromaPartWidth+1, height = chromaPartHeight+1. The
    ;// saved argument slots are patched so the second pass sees the same.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     tmp2, [sp, #0xfc]       ;// height
    STR     tmp1, [sp, #0xf8]       ;// width
    MOV     width, tmp1

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    LDR     yFrac, [sp, #0x104]     ;// yFrac
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8         ;// valX = 8-xFrac
    RSB     valY, yFrac, #8         ;// valY = 8-yFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing
    ;//
    ;// count shares r2 with x0, which is dead after the MLA above. The first
    ;// field is written with MOV so that no stale x0 bits remain in the
    ;// register and can never carry into the packed fields.

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    MOV     c32, #32                    ;// yFrac (same register) is now dead


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear vertical and horizontal interpolation

loop1_y
    ;// Vertically filter the left-most column for the two output rows.
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// reload loop_x from bits [23:20]
loop1_x
    ;// first
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5

    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (next output row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    ;// second (the column just filtered is now the left neighbour)
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5

    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (next output row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    SUBS    count, count, #2<<28        ;// loop_x -= 2, C clear on borrow
    BCS     loop1_x

    AND     tmp2, count, #0x00F00000    ;// tmp2 was used as a pixel register

    ;// ADDS leaves C clear (unless the pointer wraps), so each SBC removes
    ;// (chromaPartWidth-1)+1. The plain ADD and the AND do not touch flags.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ;// Adds 0x0E to the top byte: loop_y -= 2. The result goes negative, and
    ;// the loop ends, once loop_y underflows.
    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// second plane starts height rows down
    MLA     tmp3, tmp1, width, tmp2     ;// tmp3 = (height+y0)*width+x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64                 ;// second output plane

    AND     count, count, #0x00FFFFFF   ;// clear loop_x / loop_y
    AND     tmp1, count, #0x000F0000
    ADD     count, count, tmp1, LSL #8  ;// loop_y = chromaPartHeight-1
    AND     tmp2, count, #0x00F00000    ;// loop_x reload value

    ;// 2x2 pels per iteration
    ;// bilinear vertical and horizontal interpolation
loop2_y
    ;// Vertically filter the left-most column for the two output rows.
    LDRB    tmp1, [ptrA]
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)

    ADD     count, count, tmp2, LSL #8  ;// reload loop_x from bits [23:20]
loop2_x
    ;// first
    LDRB    tmp2, [ptrA, #1]!
    LDRB    tmp4, [ptrA, width]
    LDRB    tmp6, [ptrA, width, LSL #1]

    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|

    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5

    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (next output row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    ;// second (the column just filtered is now the left neighbour)
    LDRB    tmp1, [ptrA, #1]!
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp5, [ptrA, width, LSL #1]

    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|

    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5

    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb, #8]              ;// store pixel (next output row)
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb], #1              ;// store pixel

    SUBS    count, count, #2<<28        ;// loop_x -= 2, C clear on borrow
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000    ;// tmp2 was used as a pixel register

    ;// Same flag trick as in the first pass: C is clear after ADDS, so each
    ;// SBC removes (chromaPartWidth-1)+1.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2, negative when done
    BGE     loop2_y

    ;// Drop the locals and the saved r0-r3 (0xc4 + 0x10), then restore the
    ;// callee-saved registers and return through the saved lr.
    ADD     sp,sp,#0xd4
    LDMFD   sp!,{r4-r11,pc}

    END