; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// REQUIRE8 and PRESERVE8 are not used for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateHorHalf register allocation
;//
;// Several physical registers carry two names: the first name is the
;// role the register has while the arguments are being checked, the
;// second is the role it takes over later (fill call / filter loop).

ref         RN 0    ;// r0 : source pixel pointer

mb          RN 1    ;// r1 : output pointer
buff        RN 1    ;// r1 : fill buffer pointer handed to h264bsdFillBlock

count       RN 2    ;// r2 : packed loop counters + width (filter loop)
x0          RN 2    ;// r2 : x0 argument

y0          RN 3    ;// r3 : y0 argument
x_2_0       RN 3    ;// r3 : unpacked pixels [ 2 0 ]

width       RN 4    ;// r4 : width
x_3_1       RN 4    ;// r4 : unpacked pixels [ 3 1 ]

height      RN 5    ;// r5 : height
x_6_4       RN 5    ;// r5 : unpacked pixels [ 6 4 ]

partW       RN 6    ;// r6 : partWidth
x_7_5       RN 6    ;// r6 : unpacked pixels [ 7 5 ]

partH       RN 7    ;// r7 : partHeight
tmp1        RN 7    ;// r7 : scratch / accumulator

tmp2        RN 8    ;// r8 : scratch / accumulator

tmp3        RN 9    ;// r9 : scratch / accumulator

tmp4        RN 10   ;// r10: scratch / accumulator

mult_20_01  RN 11   ;// r11: coefficient pair, loaded with 0x00140001
mult_20_m5  RN 12   ;// r12: coefficient pair, loaded with 0x0014FFFB

plus16      RN 14   ;// r14 (lr): loaded with the constant 16


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorHalf

;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0      x_3_1      x_6_4      x_7_5      x_2_0
;//       [  2   0 ] [  3   1 ] [  6   4 ] [  7   5 ] [ 10   8 ] ...
;// y_0 =   20   1     20  -5         -5          1
;// y_1 =   -5         20   1      1  20         -5
;// y_2 =    1         -5         -5  20      1  20
;// y_3 =               1         20  -5     -5  20          1

;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorHalf
;//
;// In (registers) : r0 = ref, r1 = mb, r2 = x0, r3 = y0
;// In (stack)     : width, height, partWidth, partHeight (in that order)
;//
;// Applies the 6-tap filter (1, -5, 20, 20, -5, 1) horizontally, adds 16,
;// shifts right by 5 and saturates to 8 bits. Four output bytes are
;// written per store; the output pointer advances 16 bytes per row.
;//
;// If the source window [x0, x0+partWidth+5) x [y0, y0+partHeight) is not
;// completely inside width x height, h264bsdFillBlock is called first to
;// build a (partWidth+5)-wide copy in a stack buffer and the filter reads
;// from that copy instead.
;//
;// NOTE(review): the loop produces pixels in groups of four and keeps
;// partWidth-1 / partHeight-1 in 4-bit fields, so partWidth is presumably
;// a multiple of 4 and both sizes at most 16 -- confirm against callers.
;//-----------------------------------------------------------------------------

;// Stack frame, as offsets from sp after the prologue.
;// [sp+0x00 .. sp+0x10] hold the five stack arguments of h264bsdFillBlock.
FILL_BUF        EQU     0x28        ;// fill buffer (original note: p1[21*21/4+1])
FRAME_LOCALS    EQU     0x1e4       ;// bytes reserved below the saved registers
SAVED_MB        EQU     0x1e8       ;// pushed r1 (mb)
SAVED_X0        EQU     0x1ec       ;// pushed r2 (x0)
SAVED_Y0        EQU     0x1f0       ;// pushed r3 (y0)
ARG_WIDTH       EQU     0x218       ;// caller stack argument: width
ARG_HEIGHT      EQU     0x21c       ;// caller stack argument: height
ARG_PARTW       EQU     0x220       ;// caller stack argument: partWidth
ARG_PARTH       EQU     0x224       ;// caller stack argument: partHeight

h264bsdInterpolateHorHalf
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #FRAME_LOCALS

    ;// --- does the source window fit inside the reference area? ---
    CMP     x0, #0
    BLT     fill_block                      ;// x0 < 0
    LDR     partW, [sp, #ARG_PARTW]
    ADD     tmp4, x0, partW
    ADD     tmp4, tmp4, #5                  ;// tmp4 = x0 + partWidth + 5
    LDR     width, [sp, #ARG_WIDTH]
    CMP     tmp4, width
    BHI     fill_block                      ;// x0 + partWidth + 5 > width

    CMP     y0, #0
    BLT     fill_block                      ;// y0 < 0
    LDR     partH, [sp, #ARG_PARTH]
    ADD     tmp2, y0, partH                 ;// tmp2 = y0 + partHeight
    LDR     height, [sp, #ARG_HEIGHT]
    CMP     tmp2, height
    BLS     filter_setup                    ;// fits: filter straight from ref


    ;// --- build a padded copy of the source window on the stack ---
    ;// r0 (ref), r2 (x0) and r3 (y0) still hold the incoming arguments.
fill_block
    LDR     partH, [sp, #ARG_PARTH]
    LDR     height, [sp, #ARG_HEIGHT]
    LDR     partW, [sp, #ARG_PARTW]
    ADD     tmp4, partW, #5                 ;// tmp4 = partWidth + 5
    STMIB   sp, {height, tmp4}              ;// [sp+4] = height, [sp+8] = partWidth+5
    STR     partH, [sp, #0xc]               ;// [sp+0xc] = partHeight
    STR     tmp4, [sp, #0x10]               ;// [sp+0x10] = partWidth+5
    LDR     width, [sp, #ARG_WIDTH]
    STR     width, [sp, #0]                 ;// [sp+0] = width
    ADD     buff, sp, #FILL_BUF             ;// r1 = fill buffer
    BL      h264bsdFillBlock

    ;// From here on the filter reads the copy: origin (0,0), row length
    ;// partWidth+5. tmp4, partW and partH are used again after the call,
    ;// so this relies on h264bsdFillBlock preserving r4-r11.
    MOV     x0, #0
    STR     x0, [sp, #SAVED_X0]             ;// x0 = 0
    STR     x0, [sp, #SAVED_Y0]             ;// y0 = 0
    ADD     ref, sp, #FILL_BUF              ;// ref = fill buffer
    STR     tmp4, [sp, #ARG_WIDTH]          ;// width = partWidth + 5


    ;// --- common filter set-up ---
filter_setup
    LDR     x0, [sp, #SAVED_X0]
    LDR     y0, [sp, #SAVED_Y0]
    LDR     width, [sp, #ARG_WIDTH]
    MLA     tmp2, width, y0, x0             ;// tmp2 = y0*width + x0
    ADD     ref, ref, tmp2
    ADD     ref, ref, #8                    ;// first two words are read at -8 / -4
    LDR     mb, [sp, #SAVED_MB]             ;// r1 doubled as buff, so reload mb

    ;// Pack everything the loops need into count (r2, no longer x0):
    ;//   [31:28] column counter, reloaded with partWidth-1 for each row
    ;//   [27:24] row counter, starts at partHeight-1
    ;//   [23:20] partWidth-1
    ;//   [19:16] partHeight-1
    ;//   [15:00] width
    MOV     count, width
    SUB     partW, partW, #1
    SUB     partH, partH, #1
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16


    LDR     mult_20_01, =0x00140001         ;// halfwords: top = 20, bottom =  1
    LDR     mult_20_m5, =0x0014FFFB         ;// halfwords: top = 20, bottom = -5
    MOV     plus16, #16                     ;// added into every accumulator
    AND     tmp1, count, #0x000F0000        ;// (partHeight-1) << 16
    AND     tmp3, count, #0x00F00000        ;// (partWidth-1)  << 20
    ADD     count, count, tmp1, LSL #8      ;// row counter -> [27:24]

    ;// --- one output row per pass ---
row_loop
    LDR     x_3_1, [ref, #-8]               ;// source bytes 0..3
    ADD     count, count, tmp3, LSL #8      ;// column counter -> [31:28]
    LDR     x_7_5, [ref, #-4]               ;// source bytes 4..7
    UXTB16  x_2_0, x_3_1                    ;// [ 2 0 ]
    UXTB16  x_3_1, x_3_1, ROR #8            ;// [ 3 1 ]
    UXTB16  x_6_4, x_7_5                    ;// [ 6 4 ]

    ;// --- eight pixels per pass, as two mirrored groups of four ---
    ;// Accumulators: tmp1 = y_0, tmp2 = y_1, tmp3 = y_2, tmp4 = y_3.
quad_loop
    UXTB16  x_7_5, x_7_5, ROR #8            ;// [ 7 5 ]

    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4                ;// next four source bytes
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1                    ;// [ 10 8 ]
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// >>5, pair up as [y_3 y_1] / [y_2 y_0], saturate each to 0..255
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    SUBS    count, count, #4<<28            ;// four columns done
    ORR     tmp1, tmp1, tmp2, LSL #8        ;// interleave into y_0 y_1 y_2 y_3
    STR     tmp1, [mb], #4
    BCC     row_done                        ;// column counter borrowed: row finished

    ;// second group: same filter with the register pairs swapped
    UXTB16  x_3_1, x_3_1, ROR #8            ;// [ 11 9 ]

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4                ;// next four source bytes
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    SUBS    count, count, #4<<28
    ORR     tmp1, tmp1, tmp2, LSL #8
    STR     tmp1, [mb], #4
    BCS     quad_loop                       ;// no borrow yet: more columns in this row

    ;// --- step both pointers to the start of the next row ---
row_done
    AND     tmp3, count, #0x00F00000        ;// (partWidth-1) << 20 (tmp3 was an accumulator)
    SMLABB  ref, count, mult_20_01, ref     ;// ref += width (low halfword of count, times 1)
    ADDS    mb, mb, #16                     ;// +16; the original relies on this leaving Carry=0
    SBC     mb, mb, tmp3, LSR #20           ;// with C=0: -(partWidth-1)-1
    SBC     ref, ref, tmp3, LSR #20         ;// with C=0: -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// +1 in [31:28], -1 in [27:24]
    BGE     row_loop                        ;// result goes negative once the row counter runs out

    ADD     sp, sp, #(FRAME_LOCALS + 16)    ;// drop locals and the pushed r0-r3
    LDMFD   sp!, {r4-r11, pc}

    END