; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateHorQuarter register allocation
;//
;// Note that most physical registers serve two purposes: one alias is used
;// while setting up / clamping the reference area, the other alias is used
;// inside the filtering loops once the first value is no longer needed.

ref     RN 0        ;// in: pointer to reference frame data

mb      RN 1        ;// in: pointer to output macroblock buffer
buff    RN 1        ;// reused: pointer to on-stack fill buffer

count   RN 2        ;// packed loop counters + width (see layout in code below)
x0      RN 2        ;// in: x coordinate of the partition in the reference frame

y0      RN 3        ;// in: y coordinate of the partition in the reference frame
x_2_0   RN 3        ;// packed source pixels 2 and 0 (UXTB16 halfword pair)

width   RN 4        ;// in (stack): reference frame width
x_3_1   RN 4        ;// packed source pixels 3 and 1

height  RN 5        ;// in (stack): reference frame height
x_6_4   RN 5        ;// packed source pixels 6 and 4

partW   RN 6        ;// in (stack): partition width
x_7_5   RN 6        ;// packed source pixels 7 and 5

partH   RN 7        ;// in (stack): partition height
tmp1    RN 7        ;// filter accumulator, output pixel 0

tmp2    RN 8        ;// filter accumulator, output pixel 2

tmp3    RN 9        ;// filter accumulator, output pixel 1

tmp4    RN 10       ;// filter accumulator, output pixel 3

mult_20_01  RN 11   ;// constant 0x00140001: halfword coefficients (20, 1)

mult_20_m5  RN 12   ;// constant 0x0014FFFB: halfword coefficients (20, -5)

plus16  RN 14       ;// rounding constant 16 (added before the >>5)


;// function exports and imports

    IMPORT h264bsdFillBlock

    EXPORT h264bsdInterpolateHorQuarter


;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [ 2  0 ]  [ 3  1 ]  [ 6  4 ]  [ 7  5 ]  [ 10  8 ] ...
;// y_0 =   20  1    20 -5    -5  1
;// y_1 =   -5 20     1  1    20 -5
;// y_2 =    1 -5    -5 20     1 20
;// y_3 =    1 20    -5 -5    20  1


;-------------------------------------------------------------------------------
;// h264bsdInterpolateHorQuarter
;//
;// 6-tap (1,-5,20,20,-5,1) horizontal half-pel filter followed by a rounded
;// average with a neighbouring full-pel sample, producing the horizontal
;// quarter-pel prediction for one partition. Out-of-frame coordinates are
;// first clamped into an on-stack buffer via h264bsdFillBlock.
;//
;// Register in:  ref (r0), mb (r1), x0 (r2), y0 (r3)
;// Stack in (offsets valid after the STMFD + SUB below):
;//   [sp,#0x218] width        [sp,#0x21c] height
;//   [sp,#0x220] partWidth    [sp,#0x224] partHeight
;//   [sp,#0x228] NOTE(review): used only as a byte offset (minus 10) to pick
;//               the full-pel sample averaged in; presumably the horOffset
;//               argument selecting the left/right quarter-pel position --
;//               confirm against the C prototype.
;// Saved r0-r3 live at [sp,#0x1e4..0x1f0] and are reloaded as ref/mb/x0/y0.
;// Out: mb[] filled, 16-byte output row stride; no return value.
;-------------------------------------------------------------------------------
h264bsdInterpolateHorQuarter
    STMFD   sp!, {r0-r11, lr}       ;// 13 words saved; args now at sp+0x34+...
    SUB     sp, sp, #0x1e4          ;// locals: fill buffer + scratch

    ;// If any part of the (partW+5) x partH source area falls outside the
    ;// frame, branch to do_fill to build a clamped copy first.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, x0, partW         ;// (x0+partWidth)
    ADD     tmp4, tmp4, #5          ;// (x0+partW+5)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp4, width
    BHI     do_fill                 ;// (x0+partW+5) > width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp2, y0, partH         ;// (y0+partHeight)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp2, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    ;// Build a clamped (partW+5) x partH copy of the source area on the
    ;// stack, then retarget ref/x0/y0/width at that copy.
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5
    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1 (the filled buffer)
    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5 (buffer stride)


skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp2, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    ADD     ref, ref, #8            ;// bias ref so the loop can use the
                                    ;// negative offsets [ref,#-8]/[ref,#-4]
    LDR     mb, [sp, #0x1e8]        ;// mb

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1;
    SUB     partH, partH, #1;
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16


    LDR     mult_20_01, = 0x00140001    ;// SIMD halfword coeffs (20, 1)
    LDR     mult_20_m5, = 0x0014FFFB    ;// SIMD halfword coeffs (20, -5)
    MOV     plus16, #16                 ;// rounding term for the >>5
    AND     tmp1, count, #0x000F0000    ;// partHeight-1
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp1, LSL #8  ;// init loop_y = partHeight-1
loop_y
    LDR     x_3_1, [ref, #-8]           ;// preload pixels 3..0
    ADD     count, count, tmp3, LSL #8  ;// reload loop_x = partWidth-1
    LDR     x_7_5, [ref, #-4]           ;// preload pixels 7..4
    UXTB16  x_2_0, x_3_1                ;// unpack even bytes (2, 0)
    UXTB16  x_3_1, x_3_1, ROR #8        ;// unpack odd bytes (3, 1)
    UXTB16  x_6_4, x_7_5                ;// unpack even bytes (6, 4)

loop_x
    ;// First half: 6-tap filter for output pixels 0..3, coefficients per
    ;// the y_0..y_3 table above; each accumulator starts at +16 (rounding).
    UXTB16  x_7_5, x_7_5, ROR #8        ;// unpack odd bytes (7, 5)

    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4            ;// preload next 4 pixels (latency hidden)
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// Normalize (>>5), saturate to 8 bits and pack 4 results into one word.
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]          ;// horOffset? see header note
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10             ;// byte offset of full-pel samples

    SUBS    count, count, #4<<28        ;// loop_x -= 4 (4 pels written); C clear
                                        ;// (borrow) when the row is finished
    LDR     tmp3, [ref, tmp4]           ;// full-pel samples for averaging
    ORR     tmp1, tmp1, tmp2, LSL #8    ;// interleave into 4 packed bytes

    ;// quarter pel position: per-byte rounded average of the filtered
    ;// half-pel result and the full-pel sample:
    ;//   ((a - ~b) >> 1) ^ 0x80  ==  (a + b + 1) >> 1   for bytes a, b
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2
    STR     tmp1, [mb], #4              ;// store 4 output pixels

    BCC     next_y                      ;// row done

    ;// Second half: same filter for the next 4 output pixels, with the
    ;// pixel registers rotated one position (x_6_4/x_7_5 now oldest).
    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4            ;// preload next 4 pixels
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    ;// Normalize, saturate and pack, as in the first half.
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10

    SUBS    count, count, #4<<28        ;// loop_x -= 4
    LDR     tmp3, [ref, tmp4]
    ORR     tmp1, tmp1, tmp2, LSL #8

    ;// quarter pel: same rounded-average trick as above
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2

    STR     tmp1, [mb], #4              ;// store 4 output pixels
    BCS     loop_x                      ;// more pels left in this row

next_y
    ;// Advance ref by one source row and mb by one 16-byte output row,
    ;// both rewound by the partWidth bytes consumed during the row.
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref ;// ref += width*1 (low halfwords)
    ADDS    mb, mb, #16                 ;// +16, Carry=0
    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1  (SBC with C=0)
    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// fix up loop_x underflow,
                                        ;// loop_y -= 1; N set when all done
    BGE     loop_y

    ;// 0x1f4 = 0x1e4 locals + 16 (skip saved r0-r3), then restore and return.
    ADD     sp,sp,#0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END