; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// h264bsdInterpolateChromaHor register allocation
;// Several names alias the same physical register; each alias is live
;// only in the phase of the function its name describes.

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valX    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14
xFrac   RN 14

;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHor

;// Function arguments (stack offsets after prologue:
;// STMFD of r0-r11,lr = 0x34 bytes, plus 0xc4 of locals)
;//
;// u8  *ref,             : 0xc4
;// u8  *predPartChroma,  : 0xc8
;// i32 x0,               : 0xcc
;// i32 y0,               : 0xd0
;// u32 width,            : 0xf8
;// u32 height,           : 0xfc
;// u32 xFrac,            : 0x100
;// u32 chromaPartWidth,  : 0x104
;// u32 chromaPartHeight  : 0x108

h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #0xc4           ;// locals: fill-block parameters + padded block

    ;// If the chroma partition extends outside the reference frame,
    ;// fall through to do_fill, which builds a padded copy first.
    LDR     chrPW, [sp,#0x104]      ;// chromaPartWidth
    LDR     width, [sp,#0xf8]       ;// width
    CMP     x0, #0
    BLT     do_fill

    ADD     tmp6, x0, chrPW         ;// tmp6 = x0 + chromaPartWidth
    ADD     tmp6, tmp6, #1          ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width             ;// x0 + chromaPartWidth + 1 > width?
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill
    LDR     chrPH, [sp,#0x108]      ;// chromaPartHeight
    LDR     height, [sp,#0xfc]      ;// height
    ADD     tmp6, y0, chrPH         ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill

do_fill
    ;// Copy Cb then Cr into an on-stack block of (chrPW+1) x chrPH pels
    ;// each, with edge pixels replicated, then interpolate from there.
    LDR     chrPH, [sp,#0x108]      ;// chromaPartHeight
    LDR     height, [sp,#0xfc]      ;// height
    ADD     tmp8, chrPW, #1         ;// tmp8 = chromaPartWidth + 1
    MOV     tmp2, tmp8              ;// tmp2 = chromaPartWidth + 1
    STMIA   sp, {width,height,tmp8,chrPH,tmp2} ;// args for h264bsdFillBlock
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock

    LDR     x0, [sp,#0xcc]
    LDR     y0, [sp,#0xd0]
    LDR     ref, [sp,#0xc4]         ;// ref
    STMIA   sp, {width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width * height (Cr plane)
    MLA     block, chrPH, tmp8, block ;// block += chromaPartHeight*(chromaPartWidth+1)
    BL      h264bsdFillBlock

    ;// Re-aim the interpolation at the padded copy.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp,#0xcc]
    STR     y0, [sp,#0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp,#0xc4]         ;// ref

    STR     chrPH, [sp,#0xfc]       ;// height
    STR     tmp8, [sp,#0xf8]        ;// width
    MOV     width, tmp8
    SUB     chrPW, chrPW, #1

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width + x0
    LDR     xFrac, [sp,#0x100]      ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width + x0
    RSB     valX, xFrac, #8         ;// valX = 8 - xFrac

    LDR     mb, [sp,#0xc8]          ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing

    SUB     tmp2, chrPH, #1         ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1         ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valX, valX, xFrac, LSL #16  ;// |xFrac|valX|
    MOV     valX, valX, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding term for SMLAD


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation:
    ;// out = (8-xFrac)*pel[x] + xFrac*pel[x+1] + 32, scaled down by 6

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x into [31:28]
    LDRB    tmp1, [ptrA, width]         ;// row 2, current column
    LDRB    tmp2, [ptrA], #1            ;// row 1, current column

loop1_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16   ;// |row2 pel|row1... pack pel pairs
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2; carry clear on borrow
    BCS     loop1_x

    AND     tmp2, count, #0x00F00000    ;// reload loop_x

    ;// advance mb and ptrA two rows down, minus the columns just written
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (mod 16 arithmetic)
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp,#0xc4]             ;// ref
    LDR     tmp1, [sp,#0xd0]            ;// y0
    LDR     tmp2, [sp,#0xcc]            ;// x0
    LDR     mb, [sp,#0xc8]              ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// skip the Cb plane: y0 += height
    MLA     tmp3, tmp1, width, tmp2
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64                 ;// Cr output starts 64 bytes in

    ;// rebuild loop counters in count from the saved [23:16] fields
    AND     count, count, #0x00FFFFFF
    AND     tmp1, count, #0x000F0000
    ADD     count, count, tmp1, LSL #8
    AND     tmp2, count, #0x00F00000

    ;// 2x2 pels per iteration
    ;// bilinear horizontal interpolation (same scheme as Cb loop)
loop2_y
    ADD     count, count, tmp2, LSL #8
    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

loop2_x
    LDRB    tmp3, [ptrA, width]
    LDRB    tmp4, [ptrA], #1

    PKHBT   tmp5, tmp1, tmp3, LSL #16
    PKHBT   tmp6, tmp2, tmp4, LSL #16

    LDRB    tmp1, [ptrA, width]
    LDRB    tmp2, [ptrA], #1

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16
    PKHBT   tmp8, tmp4, tmp2, LSL #16

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000

    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1

    ADDS    count, count, #0xE << 24
    BGE     loop2_y

    ;// epilogue: drop locals + the saved r0-r3 slots (0xc4 + 0x10 = 0xd4)
    ADD     sp, sp, #0xd4
    LDMFD   sp!, {r4-r11,pc}

    END