; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version horizontal part of
;--            h264bsdInterpolateMid functions
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE


;// Register allocation
;//
;// x_N_M registers hold two source bytes, each zero-extended to 16 bits by
;// UXTB16: byte N in the top halfword and byte M in the bottom halfword
;// (byte numbering assumes little-endian word loads -- TODO confirm).

ref     RN 0    ;// pointer to current position in reference image
mb      RN 1    ;// pointer to current position in interpolated mb
count   RN 2    ;// bit-packed width and count values

x_2_0   RN 4
x_3_1   RN 5
x_6_4   RN 6
x_7_5   RN 7

tmp1    RN 8    ;// accumulator for y_0
tmp2    RN 9    ;// accumulator for y_1
tmp3    RN 10   ;// accumulator for y_2 (also scratch for the width field)
tmp4    RN 11   ;// accumulator for y_3

mult_20_01  RN 12   ;// [20,  1]  (top halfword 20, bottom halfword 1)
mult_20_m5  RN 14   ;// [20, -5]  (top halfword 20, bottom halfword -5)

;// NOTE: mult_20_m5 lives in r14 (lr); the return address is only available
;// from the stack frame pushed at function entry.

        EXPORT  h264bsdInterpolateMidHorPart

;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
;//
;// Each row is the 6-tap kernel (1, -5, 20, 20, -5, 1) slid one pixel to
;// the right; the dual 16x16 multiplies (SMUAD/SMLAD/SMLADX) pick up two
;// taps at a time, the single ones (SMULTB/SMLATB/SMLABB) pick up one.

;//-----------------------------------------------------------------------------
;// h264bsdInterpolateMidHorPart
;//
;// Runs the 6-tap horizontal filter over a block of source bytes and stores
;// every filter sum as a 32-bit word. No rounding, shifting or clipping is
;// applied here.
;//
;// In:   r0 (ref)   source pointer; the first bytes read are at ref-8
;//       r1 (mb)    destination for the 32-bit sums, written contiguously
;//       r2 (count) packed parameters, see the field list in the body
;// Out:  nothing is returned in a register by this code
;// Uses: r4-r11 and lr are pushed on entry and restored on exit;
;//       r0-r2, r12 and the flags are modified.
;//
;// NOTE(review): the column loop produces four results per pass and the row
;// bookkeeping relies on the column counter ending in a borrow, so partWidth
;// is presumably always a multiple of 4 -- confirm against the callers.
;//-----------------------------------------------------------------------------

h264bsdInterpolateMidHorPart
    STMFD   sp!, {r4-r11, lr}

    ;// Layout of the count register as this routine uses it:
    ;// [31:28] loop_x, running column counter (reloaded from [19:16] per row)
    ;// [23:20] partHeight-1, counted down by one per row (borrows run on
    ;//         upwards into [27:24])
    ;// [19:16] partWidth-1
    ;// [15:00] width (added to ref once per row)
    ;//
    ;// NOTE(review): the comment originally here listed [23:20] as
    ;// partWidth-1 and [19:16] as partHeight-1, and described [27:24] as a
    ;// separate loop_y field. The code below masks 0x000F0000 as partWidth-1
    ;// and decrements from bit 20 for rows, so the list above follows the
    ;// code. [31:28] is only ever added to, so it is presumably expected to
    ;// be zero on entry -- confirm against the callers.
    LDR     mult_20_01, = 0x00140001
    LDR     mult_20_m5, = 0x0014FFFB
    AND     tmp3, count, #0x000F0000    ;// partWidth-1

loop_y
    ;// Row set-up: preload the two words at ref-8 and ref-4 and start
    ;// unpacking them; tmp3 holds (partWidth-1)<<16 here and seeds loop_x.
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp3, LSL #12
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x
    ;// First half: window is x_2_0, x_3_1, x_6_4, x_7_5.
    UXTB16  x_7_5, x_7_5, ROR #8        ;// finish unpacking the newest word

    SMUAD   tmp1, x_2_0, mult_20_01
    SMULTB  tmp2, x_2_0, mult_20_m5
    SMULTB  tmp3, x_2_0, mult_20_01
    SMULTB  tmp4, x_3_1, mult_20_01

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    LDR     x_3_1, [ref], #4            ;// old x_3_1 fully consumed: fetch next word
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    UXTB16  x_2_0, x_3_1                ;// even bytes of the new word
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4   ;// last tap of y_3 comes from the new word

    ;// STR leaves the flags alone, so the borrow from SUBS is still valid
    ;// at the branch below.
    SUBS    count, count, #4<<28
    STR     tmp1, [mb], #4
    STR     tmp2, [mb], #4
    STR     tmp3, [mb], #4
    STR     tmp4, [mb], #4
    BCC     next_y                      ;// column counter borrowed: row finished

    ;// Second half: same computation with the register roles rotated;
    ;// window is x_6_4, x_7_5, x_2_0, x_3_1.
    UXTB16  x_3_1, x_3_1, ROR #8        ;// odd bytes of the word loaded above

    SMUAD   tmp1, x_6_4, mult_20_01
    SMULTB  tmp2, x_6_4, mult_20_m5
    SMULTB  tmp3, x_6_4, mult_20_01
    SMULTB  tmp4, x_7_5, mult_20_01

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    LDR     x_7_5, [ref], #4            ;// old x_7_5 fully consumed: fetch next word
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    UXTB16  x_6_4, x_7_5                ;// even bytes of the new word
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4   ;// last tap of y_3 comes from the new word

    SUBS    count, count, #4<<28
    STR     tmp1, [mb], #4
    STR     tmp2, [mb], #4
    STR     tmp3, [mb], #4
    STR     tmp4, [mb], #4
    BCS     loop_x                      ;// no borrow: more columns in this row

next_y
    ;// Reached with carry clear on both paths (BCC taken, or BCS not taken).
    ;// SMLABB does not write the carry flag, so SBC below subtracts one
    ;// extra. Net effect: ref += width - partWidth, i.e. the bytes consumed
    ;// by the post-incrementing loads are undone and ref moves down one row.
    AND     tmp3, count, #0x000F0000    ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref   ;// +width
    SBC     ref, ref, tmp3, LSR #16   ;// -(partWidth-1)-1
    ;// +(1<<28) wraps the borrowed column field back to zero, -(1<<20) counts
    ;// one row off; the result turns negative once the row count underflows.
    ADDS    count, count, #(1<<28)-(1<<20)
    BGE     loop_y

    LDMFD   sp!, {r4-r11, pc}

    END