; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We do not use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateChromaVer register allocation
;//
;// Several names share one physical register. The aliases that matter:
;//   r2  : x0 on entry, later the packed loop counter (count)
;//   r5  : height, overwritten inside the loops (tmp7)
;//   r6  : chromaPartWidth, overwritten inside the loops (tmp8)
;//   r11 : chromaPartHeight, overwritten inside the loops (tmp5)
;//   r14 : yFrac while the weights are built, then the rounding constant 32
;// Because of this, chromaPartWidth/Height are carried in 'count' during the
;// loops and 'height' is reloaded from the stack before the Cr pass.

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valY    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11
chrPH   RN 11

tmp6    RN 12

c32     RN 14
yFrac   RN 14

;// Stack frame layout: byte offsets from sp after the prologue.
;//
;// The prologue pushes r0-r11 and lr (13 words = 0x34 bytes) and then reserves
;// FRAME_LOCALS bytes, so the caller's stack arguments start at
;// 0x34 + 0xc4 = 0xf8.
;//
;//   [sp, #0x00 .. #0x13]  five outgoing stack arguments for h264bsdFillBlock
;//   [sp, #OFS_BLOCK ...]  scratch block written by h264bsdFillBlock
;//   [sp, #OFS_REF ...]    saved r0-r3 (ref, predPartChroma, x0, y0)
;//   [sp, #OFS_SAVED_R4]   saved r4-r11, lr
;//   [sp, #OFS_WIDTH ...]  stack arguments passed by the caller

FRAME_LOCALS    EQU 0xc4    ;// bytes reserved below the pushed registers
OFS_BLOCK       EQU 0x1c    ;// scratch block for the padded reference data
OFS_REF         EQU 0xc4    ;// u8 *ref              (saved r0)
OFS_PRED        EQU 0xc8    ;// u8 *predPartChroma   (saved r1)
OFS_X0          EQU 0xcc    ;// i32 x0               (saved r2)
OFS_Y0          EQU 0xd0    ;// i32 y0               (saved r3)
OFS_SAVED_R4    EQU 0xd4    ;// first callee-saved register (r4)
OFS_WIDTH       EQU 0xf8    ;// u32 width
OFS_HEIGHT      EQU 0xfc    ;// u32 height
OFS_YFRAC       EQU 0x100   ;// u32 yFrac
OFS_CHR_PW      EQU 0x104   ;// u32 chromaPartWidth
OFS_CHR_PH      EQU 0x108   ;// u32 chromaPartHeight

;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaVer

;-------------------------------------------------------------------------------
;// h264bsdInterpolateChromaVer
;//
;// Vertical bilinear interpolation of one chroma partition, first for the
;// Cb plane and then for the Cr plane. Each output pel is
;//     ((8 - yFrac) * upper + yFrac * lower) * 8 + 32) >> 6
;// where 'upper' and 'lower' are vertically adjacent reference pels.
;//
;// Function arguments (r0-r3, then the caller's stack):
;//     u8  *ref                r0
;//     u8  *predPartChroma     r1
;//     i32 x0                  r2
;//     i32 y0                  r3
;//     u32 width               [sp, #OFS_WIDTH]
;//     u32 height              [sp, #OFS_HEIGHT]
;//     u32 yFrac               [sp, #OFS_YFRAC]
;//     u32 chromaPartWidth     [sp, #OFS_CHR_PW]
;//     u32 chromaPartHeight    [sp, #OFS_CHR_PH]
;//
;// Output: predPartChroma is written with a row stride of 8 bytes; the Cr
;// result starts 64 bytes after the Cb result. No return value is set up.
;//
;// The Cr reference plane is read 'height' rows after the Cb plane in 'ref'.
;//
;// If the block (chromaPartWidth x chromaPartHeight+1 pels at x0,y0) does not
;// lie fully inside the width x height plane, both planes are first copied
;// into a stack scratch block through h264bsdFillBlock and the interpolation
;// reads from that copy instead.
;//
;// The 2x2 kernel processes two columns and two rows per iteration, and the
;// loop counters are 4-bit fields.
;// NOTE(review): this looks like it requires even chromaPartWidth/Height no
;// larger than 16 - confirm against the callers.
;//
;// Clobbers: r0-r3, r12, flags. r4-r11 are saved and restored.
;-------------------------------------------------------------------------------

h264bsdInterpolateChromaVer
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #FRAME_LOCALS

    ;// Decide whether the referenced area lies completely inside the plane
    LDR     chrPW, [sp, #OFS_CHR_PW]    ;// chromaPartWidth
    LDR     width, [sp, #OFS_WIDTH]     ;// width
    CMP     x0, #0
    BLT     do_fill                     ;// x0 < 0

    ADD     tmp1, x0, chrPW             ;// tmp1 = x0 + chromaPartWidth
    CMP     tmp1, width                 ;// x0 + chromaPartWidth > width (unsigned)
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill                     ;// y0 < 0
    LDR     chrPH, [sp, #OFS_CHR_PH]    ;// chromaPartHeight
    LDR     height, [sp, #OFS_HEIGHT]   ;// height
    ADD     tmp1, y0, chrPH             ;// tmp1 = y0 + chromaPartHeight
    ADD     tmp1, tmp1, #1              ;// one extra row is needed below the block
    CMP     tmp1, height
    BLS     skip_fill                   ;// fully inside: read 'ref' directly

do_fill
    ;// First call: fill the scratch block from the Cb plane.
    ;// r0-r3 still hold ref, (block), x0, y0. The five words stored at sp are
    ;// the remaining arguments: width, height, chromaPartWidth,
    ;// chromaPartHeight+1 and chromaPartWidth again.
    ;// NOTE(review): presumably these map to (width, height, blockWidth,
    ;// blockHeight, fillScanLength) - confirm against the C prototype of
    ;// h264bsdFillBlock.
    LDR     chrPH, [sp, #OFS_CHR_PH]    ;// chromaPartHeight
    LDR     height, [sp, #OFS_HEIGHT]   ;// height
    ADD     tmp1, chrPH, #1             ;// tmp1 = chromaPartHeight+1
    MOV     tmp2, chrPW                 ;// tmp2 = chromaPartWidth
    STMIA   sp,{width,height,chrPW,tmp1,tmp2}
    ADD     block, sp, #OFS_BLOCK       ;// block
    BL      h264bsdFillBlock

    ;// Second call: same for the Cr plane. r0-r3 are reloaded after the call
    ;// and the stack arguments are written again before reuse; r4-r8 and r11
    ;// are relied upon to survive the call (callee-saved under the AAPCS).
    LDR     x0, [sp, #OFS_X0]
    LDR     y0, [sp, #OFS_Y0]
    LDR     ref, [sp, #OFS_REF]         ;// ref
    STMIA   sp,{width,height,chrPW,tmp1,tmp2}
    ADD     block, sp, #OFS_BLOCK       ;// block
    MLA     ref, height, width, ref     ;// ref += width * height (Cr plane)
    MLA     block, chrPW, tmp1, block   ;// block += chromaPW * (chromaPH+1)
    BL      h264bsdFillBlock

    ;// From here on interpolate from the scratch block: origin (0,0), plane
    ;// size chromaPartWidth x (chromaPartHeight+1). The saved arguments are
    ;// patched as well because the Cr pass reloads them from the stack.
    MOV     x0, #0                      ;// x0 = 0
    MOV     y0, #0                      ;// y0 = 0
    STR     x0, [sp, #OFS_X0]
    STR     y0, [sp, #OFS_Y0]
    ADD     ref, sp, #OFS_BLOCK         ;// ref = block
    STR     ref, [sp, #OFS_REF]         ;// ref

    STR     tmp1, [sp, #OFS_HEIGHT]     ;// height = chromaPartHeight+1
    STR     chrPW, [sp, #OFS_WIDTH]     ;// width  = chromaPartWidth
    MOV     width, chrPW

skip_fill
    MLA     tmp3, y0, width, x0         ;// tmp3 = y0*width+x0
    LDR     yFrac, [sp, #OFS_YFRAC]     ;// yFrac
    ADD     ptrA, ref, tmp3             ;// ptrA = ref + y0*width+x0
    RSB     valY, yFrac, #8             ;// valY = 8-yFrac

    LDR     mb, [sp, #OFS_PRED]         ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing
    ;//
    ;// count shares r2 with x0, and x0 is not needed in a register any more
    ;// (the Cr pass reloads it from the stack). The first field is therefore
    ;// written with MOV, so that no stale x0 bits end up in the counter.

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    MOV     valY, valY, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding term (r14 no longer holds yFrac)


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear vertical interpolation

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x = chromaPartWidth-1
loop1_x
    ;// Process 2x2 block: rows A, B, C of two adjacent columns are loaded;
    ;// output row 1 blends A and B, output row 2 blends B and C.
    LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col

    LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col

    PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|

    ;// SMLAD: lo*lo + hi*hi + 32, i.e. upper*8*(8-yFrac) + lower*8*yFrac + 32
    SMLAD   tmp7, tmp2, valY, c32       ;// multiply
    PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    SMLAD   tmp2, tmp1, valY, c32       ;// multiply
    SMLAD   tmp8, tmp5, valY, c32       ;// multiply
    SMLAD   tmp5, tmp4, valY, c32       ;// multiply

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    MOV     tmp2, tmp2, LSR #6          ;// scale down
    STRB    tmp2, [mb],#1               ;// store row 1 col 1

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb],#1               ;// store row 1 col 2


    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop1_x                     ;// continue until the subtraction borrows

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1 (in place)

    ;// Advance both pointers by two rows minus the columns just walked.
    ;// ADDS leaves C clear (the pointer addition does not wrap), so each SBC
    ;// subtracts (chromaPartWidth-1) + 1 = chromaPartWidth. The plain ADD and
    ;// the SBCs do not update the flags, so the order here must be kept.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb   += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ;// Bits [31:28] are 0xF after the borrow above. Adding 0xE to the loop_y
    ;// nibble subtracts 2 from it; while loop_y >= 2 the carry wraps the top
    ;// nibble to 0 (result non-negative), otherwise the result stays negative.
    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    ;// r0, r1, r5 and r7-r8 were used as scratch above: reload from the stack
    LDR     height, [sp, #OFS_HEIGHT]   ;// height
    LDR     ref, [sp, #OFS_REF]         ;// ref
    LDR     tmp1, [sp, #OFS_Y0]         ;// y0
    LDR     tmp2, [sp, #OFS_X0]         ;// x0
    LDR     mb, [sp, #OFS_PRED]         ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// Cr rows start 'height' rows below Cb
    MLA     tmp3, tmp1, width, tmp2     ;// tmp3 = (height+y0)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64                 ;// Cr output follows the 8x8 Cb output

    ;// Rebuild loop_y / loop_x from the preserved fields [23:16]
    AND     count, count, #0x00FFFFFF
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1 (in place)
    ADD     count, count, tmp1, LSL #8  ;// loop_y
    AND     tmp2, count, #0x00F00000    ;// loop_x

    ;// 2x2 pels per iteration
    ;// bilinear vertical interpolation
    ;// (same kernel and loop control as the Cb pass above)
loop2_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x = chromaPartWidth-1
loop2_x
    ;// Process 2x2 block
    LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col

    LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col

    PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|

    SMLAD   tmp7, tmp2, valY, c32       ;// multiply
    PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    SMLAD   tmp2, tmp1, valY, c32       ;// multiply
    SMLAD   tmp8, tmp5, valY, c32       ;// multiply
    SMLAD   tmp5, tmp4, valY, c32       ;// multiply

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    MOV     tmp2, tmp2, LSR #6          ;// scale down
    STRB    tmp2, [mb],#1               ;// store row 1 col 1

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb],#1               ;// store row 1 col 2


    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop2_x                     ;// continue until the subtraction borrows

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1 (in place)

    ;// Flag-dependent pointer update; keep the instruction order.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb   += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2, negative when done
    BGE     loop2_y

    ;// Drop the locals and the saved r0-r3, then restore r4-r11 and return
    ADD     sp,sp,#OFS_SAVED_R4
    LDMFD   sp!, {r4-r11,pc}

    END