; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateHorQuarter register allocation
;//
;// Several names alias the same physical register: the first name of each
;// pair is used during argument checking / block fill, the second one
;// inside the filter loops.

ref     RN 0            ;// source pointer (1st argument)

mb      RN 1            ;// destination pointer (2nd argument)
buff    RN 1            ;// scratch buffer pointer passed to h264bsdFillBlock

count   RN 2            ;// packed loop counters and row stride
x0      RN 2            ;// 3rd argument

y0      RN 3            ;// 4th argument
x_2_0   RN 3            ;// source pixels [2 0] as two 16-bit lanes

width   RN 4
x_3_1   RN 4            ;// source pixels [3 1]

height  RN 5
x_6_4   RN 5            ;// source pixels [6 4]

partW   RN 6
x_7_5   RN 6            ;// source pixels [7 5]

partH   RN 7
tmp1    RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

mult_20_01  RN 11       ;// packed coefficients: top = 20, bottom =  1
mult_20_m5  RN 12       ;// packed coefficients: top = 20, bottom = -5

plus16  RN 14           ;// rounding constant (lr is free after the prologue)


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorQuarter


;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
;//
;// In the loop below tmp1 accumulates y_0, tmp2 y_1, tmp3 y_2 and tmp4 y_3.

;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorQuarter
;//
;// Applies the 6-tap filter (1,-5,20,20,-5,1) horizontally, rounds with
;// (+16)>>5, saturates to 8 bits and then averages every result byte with a
;// neighbouring source byte ((a+b+1)>>1) before storing it to mb.
;//
;// In (registers):
;//   r0 = ref, r1 = mb, r2 = x0, r3 = y0
;// In (caller stack; offsets are relative to sp after the prologue, which
;// pushes 13 words and then reserves 0x1e4 bytes = 0x218 in total):
;//   [sp,#0x218] = width
;//   [sp,#0x21c] = height
;//   [sp,#0x220] = partWidth
;//   [sp,#0x224] = partHeight
;//   [sp,#0x228] = byte offset selecting the source pixel used for the
;//                 average  -- NOTE(review): presumably the horOffset
;//                 argument of the C prototype; confirm
;// Saved argument registers (written by the prologue, re-read later):
;//   [sp,#0x1e4] = r0 (ref)   [sp,#0x1e8] = r1 (mb)
;//   [sp,#0x1ec] = r2 (x0)    [sp,#0x1f0] = r3 (y0)
;// Locals:
;//   [sp,#0x00]..[sp,#0x10] = outgoing stack arguments for h264bsdFillBlock
;//   [sp,#0x28]...          = scratch block filled by h264bsdFillBlock
;//
;// Output rows are written 16 bytes apart (see next_y).
;//
;// NOTE(review): the counter packing uses 4-bit fields and the x loop
;// produces 4 pixels per step, so this appears to assume that partWidth is
;// a multiple of 4 and that partWidth and partHeight are at most 16 -- confirm
;// against the callers.
;//-----------------------------------------------------------------------------

h264bsdInterpolateHorQuarter
    STMFD   sp!, {r0-r11, lr}       ;// r0-r3 are saved too so they can be re-read
    SUB     sp, sp, #0x1e4          ;// locals + scratch block

    ;// Decide whether the area read by the filter lies inside the picture.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, x0, partW         ;// (x0+partWidth)
    ADD     tmp4, tmp4, #5          ;// (x0+partWidth+5)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp4, width
    BHI     do_fill                 ;// (x0+partWidth+5) > width  (unsigned)

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp2, y0, partH         ;// (y0+partHeight)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp2, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    ;// Can be reached before partW/partH/width/height were loaded above,
    ;// hence everything is (re)loaded here. r0 (ref), r2 (x0) and r3 (y0)
    ;// still hold the incoming arguments and are passed through unchanged.
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1] (scratch block on the stack)
    BL      h264bsdFillBlock

    ;// From here on read from the scratch block instead of the picture.
    ;// tmp4, partW and partH (r10, r6, r7) are used after the call, i.e. the
    ;// callee is relied upon to preserve them.
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5


skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp2, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    ADD     ref, ref, #8            ;// ref = ref+8 (first loads use offsets -8 and -4)
    LDR     mb, [sp, #0x1e8]        ;// mb

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    MOV     count, width
    SUB     partW, partW, #1;
    SUB     partH, partH, #1;
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16


    LDR     mult_20_01, = 0x00140001    ;// top = 20, bottom =  1
    LDR     mult_20_m5, = 0x0014FFFB    ;// top = 20, bottom = -5
    MOV     plus16, #16                 ;// rounding term for the >>5 below
    AND     tmp1, count, #0x000F0000    ;// partHeight-1
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y field = partHeight-1

loop_y
    ;// Start of a row: preload the first 8 source pixels and (re)arm the
    ;// loop_x field. tmp3 holds (partWidth-1)<<20 here, both on the first
    ;// pass and when arriving from next_y.
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp3, LSL #8  ;// loop_x field = partWidth-1
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1                ;// bytes 0 and 2 -> 16-bit lanes
    UXTB16  x_3_1, x_3_1, ROR #8        ;// bytes 1 and 3 -> 16-bit lanes
    UXTB16  x_6_4, x_7_5

loop_x
    ;// ---- first half: 4 output pixels, taps start in x_2_0 ----
    UXTB16  x_7_5, x_7_5, ROR #8

    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4            ;// next 4 source pixels, ref += 4
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1                ;// becomes the [10 8] column of the table
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// Scale by >>5 and pack: tmp1 = [y_2 : y_0], tmp2 = [y_3 : y_1]
    ;// (the LSL #(16-5) places the >>5 result of the second operand in the
    ;// top halfword).
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]          ;// offset argument, reloaded every pass
    USAT16  tmp2, #8, tmp2              ;// clamp both lanes to 0..255
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10             ;// ref is already 12 past the first tap:
                                        ;// address = first tap + 2 + offset

    SUBS    count, count, #4<<28        ;// 4 pixels done; C clear when row is finished
    LDR     tmp3, [ref, tmp4]           ;// 4 source bytes to average with
    ORR     tmp1, tmp1, tmp2, LSL #8    ;// bytes y_0, y_1, y_2, y_3

;// quarter pel position
    ;// Per byte: UHSUB8(a, ~b) EOR 0x80 == (a + b + 1) >> 1
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2
    STR     tmp1, [mb], #4

    BCC     next_y                      ;// uses the carry from the SUBS above

    ;// ---- second half: same filter with the roles of (x_2_0, x_3_1) and
    ;// (x_6_4, x_7_5) exchanged ----
    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4            ;// next 4 source pixels, ref += 4
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]          ;// offset argument, reloaded every pass
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10

    SUBS    count, count, #4<<28        ;// 4 more pixels done
    LDR     tmp3, [ref, tmp4]
    ORR     tmp1, tmp1, tmp2, LSL #8

;// quarter pel
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2
    STR     tmp1, [mb], #4

    BCS     loop_x                      ;// row not finished yet

next_y
    ;// Advance to the next row: ref += width - partWidth,
    ;// mb += 16 - partWidth. The two SBCs rely on the carry left clear by
    ;// the ADDS, which makes them subtract one extra.
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref ;// +width
    ADDS    mb, mb, #16                 ;// +16, Carry=0
    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    ADDS    count, count, #(1<<28)-(1<<24)  ;// clear loop_x field, loop_y field -= 1
    BGE     loop_y                      ;// result goes negative after the last row

    ADD     sp,sp,#0x1f4                ;// 0x1e4 locals + 16 bytes of saved r0-r3
    LDMFD   sp!, {r4-r11, pc}

    END