1; Copyright (C) 2009 The Android Open Source Project
2;
3; Licensed under the Apache License, Version 2.0 (the "License");
4; you may not use this file except in compliance with the License.
5; You may obtain a copy of the License at
6;
7;      http://www.apache.org/licenses/LICENSE-2.0
8;
9; Unless required by applicable law or agreed to in writing, software
10; distributed under the License is distributed on an "AS IS" BASIS,
11; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12; See the License for the specific language governing permissions and
13; limitations under the License.
14
15;-------------------------------------------------------------------------------
16;--
17;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
18;--
19;-------------------------------------------------------------------------------
20
21
22    IF :DEF: H264DEC_WINASM
23        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
24    ELSE
25        REQUIRE8
26        PRESERVE8
27    ENDIF
28
29    AREA    |.text|, CODE
30
31;// h264bsdInterpolateHorHalf register allocation
;//
;// Several registers carry two names because they are reused: one name for
;// the argument / bounds-check phase and another inside the filter loops.
;// The x_N_M names hold two zero-extended source bytes as 16-bit halfwords
;// (produced with UXTB16 in the routine).
32
33ref     RN 0                  ;// r0: source pointer; advanced as pixels are read
34
35mb      RN 1                  ;// r1: destination pointer for the filtered pixels
36buff    RN 1                  ;// r1: scratch buffer passed to h264bsdFillBlock
37
38count   RN 2                  ;// r2: packed loop counters + width (inside the loops)
39x0      RN 2                  ;// r2: incoming column offset argument
40
41y0      RN 3                  ;// r3: incoming row offset argument
42x_2_0   RN 3                  ;// r3: pixel pair used by the filter loops
43
44width   RN 4                  ;// r4: source row length (setup phase)
45x_3_1   RN 4                  ;// r4: pixel pair / raw loaded word in the loops
46
47height  RN 5                  ;// r5: source height (setup phase)
48x_6_4   RN 5                  ;// r5: pixel pair used by the filter loops
49
50partW   RN 6                  ;// r6: partWidth (setup phase)
51x_7_5   RN 6                  ;// r6: pixel pair / raw loaded word in the loops
52
53partH   RN 7                  ;// r7: partHeight (setup phase)
54tmp1    RN 7                  ;// r7: scratch / filter accumulator
55
56tmp2    RN 8                  ;// r8: scratch / filter accumulator
57
58tmp3    RN 9                  ;// r9: scratch / filter accumulator
59
60tmp4    RN 10                 ;// r10: scratch / filter accumulator
61
62mult_20_01  RN 11             ;// r11: loaded with 0x00140001 (halfwords 20 and 1)
63mult_20_m5  RN 12             ;// r12: loaded with 0x0014FFFB (halfwords 20 and -5)
64
65plus16  RN 14                 ;// r14 (lr, already saved on the stack): constant 16
66
67
68;// function exports and imports
69
70    IMPORT  h264bsdFillBlock
71
72    EXPORT  h264bsdInterpolateHorHalf
73
74;// Horizontal filter approach
75;//
76;// The basic idea in horizontal filtering is to arrange the coefficients
77;// as shown below. Calculations are done with 16-bit arithmetic.
78;//
79;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
80;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
81;// y_0 =   20  1     20 -5        -5         1
82;// y_1 =   -5        20  1      1 20        -5
83;// y_2 =    1        -5        -5 20      1 20
84;// y_3 =              1        20 -5     -5 20         1
85
86
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorHalf
;//
;// Horizontal half-sample interpolation. Every output byte is
;//     sat8((p[i] - 5*p[i+1] + 20*p[i+2] + 20*p[i+3] - 5*p[i+4] + p[i+5] + 16) >> 5)
;// computed four outputs at a time with the ARMv6 dual 16-bit multiplies.
;//
;// Inputs as read by the code below:
;//   r0  ref        source pointer
;//   r1  mb         destination pointer
;//   r2  x0         column offset of the block inside ref
;//   r3  y0         row offset of the block inside ref
;//   stack, in order at entry: width, height, partWidth, partHeight
;//
;// Output: 4 bytes per STR are written through mb; after each row mb is moved
;// by 16 - partWidth, so consecutive output rows are 16 bytes apart.
;//
;// NOTE(review): the packed counter gives 4 bits each to partWidth-1 and
;// partHeight-1 and 16 bits to width, and the loop bookkeeping (4 pixels per
;// step, loop_x nibble expected to end at 0xF) only works out when partWidth
;// is a multiple of 4 -- confirm callers guarantee this.
;//
;// Frame after the prologue (sp-relative):
;//   0x000-0x010  outgoing stack arguments for h264bsdFillBlock
;//   0x028        scratch buffer handed to h264bsdFillBlock
;//   0x1e4-0x1f0  saved r0-r3 (ref, mb, x0, y0)
;//   0x1f4-0x214  saved r4-r11, lr
;//   0x218-0x224  incoming width, height, partWidth, partHeight
;//
;// r4-r11 are restored on exit; r0-r3, r12 and the flags are not preserved.
;//-----------------------------------------------------------------------------
87h264bsdInterpolateHorHalf
88    STMFD   sp!, {r0-r11, lr}       ;// save args r0-r3, r4-r11 and lr (13 words = 0x34)
89    SUB     sp, sp, #0x1e4          ;// locals; stack args are now at sp+0x218
90
    ;// Fill is needed when the block plus the 5 extra filter columns does not
    ;// lie completely inside the source area.
91    CMP     x0, #0
92    BLT     do_fill                 ;// (x0 < 0)
93    LDR     partW, [sp,#0x220]      ;// partWidth
94    ADD     tmp4, x0, partW         ;// (x0+partWidth)
95    ADD     tmp4, tmp4, #5          ;// (x0+partW+5)
96    LDR     width, [sp,#0x218]      ;// width
97    CMP     tmp4, width
98    BHI     do_fill                 ;// (x0+partW+5) > width (unsigned compare)
99
100    CMP     y0, #0
101    BLT     do_fill                 ;// (y0 < 0)
102    LDR     partH, [sp,#0x224]      ;// partHeight
103    ADD     tmp2, y0, partH         ;// (y0+partHeight)
104    LDR     height, [sp,#0x21c]     ;// height
105    CMP     tmp2, height
106    BLS     skip_fill               ;// (y0+partH) <= height: no overfill needed
107
108
    ;// Call h264bsdFillBlock. r0, r2, r3 still hold ref, x0, y0 from entry;
    ;// r1 is pointed at the scratch buffer. Stack arguments, in order:
    ;// width, height, partWidth+5, partHeight, partWidth+5.
109do_fill
110    LDR     partH, [sp,#0x224]      ;// partHeight
111    LDR     height, [sp,#0x21c]     ;// height
112    LDR     partW, [sp,#0x220]      ;// partWidth
113    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
114    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
115    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
116    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
117    LDR     width, [sp,#0x218]      ;// width
118    STR     width, [sp,#0]          ;// sp+0 = width
119    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1] (scratch buffer at sp+0x28)
120    BL      h264bsdFillBlock
121
    ;// Continue from the filled scratch buffer: position (0,0), row length
    ;// partWidth+5. tmp4, partW and partH are used after the call, so this
    ;// relies on the callee preserving r6, r7 and r10.
122    MOV     x0, #0
123    STR     x0,[sp,#0x1ec]          ;// x0 = 0 (saved copy in the frame)
124    STR     x0,[sp,#0x1f0]          ;// y0 = 0 (saved copy in the frame)
125    ADD     ref,sp,#0x28            ;// ref = p1
126    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5 (overwrites the incoming arg slot)
127
128
129skip_fill
130    LDR     x0 ,[sp,#0x1ec]         ;// x0
131    LDR     y0 ,[sp,#0x1f0]         ;// y0
132    LDR     width, [sp,#0x218]      ;// width
133    MLA     tmp2, width, y0, x0     ;// y0*width+x0
134    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
135    ADD     ref, ref, #8            ;// ref = ref+8; first loads use [ref,#-8] and [ref,#-4]
136    LDR     mb, [sp, #0x1e8]        ;// mb (r1 was reused as buff above)
137
138    ;// pack values to count register
139    ;// [31:28] loop_x (partWidth-1)
140    ;// [27:24] loop_y (partHeight-1)
141    ;// [23:20] partWidth-1
142    ;// [19:16] partHeight-1
143    ;// [15:00] width
144    MOV     count, width
145    SUB     partW, partW, #1;
146    SUB     partH, partH, #1;
147    ADD     tmp2, partH, partW, LSL #4
148    ADD     count, count, tmp2, LSL #16
149
150
151    LDR     mult_20_01, = 0x00140001    ;// halfwords: top = 20, bottom = 1
152    LDR     mult_20_m5, = 0x0014FFFB    ;// halfwords: top = 20, bottom = -5 (0xFFFB)
153    MOV     plus16, #16                 ;// rounding term; lr is free, it was saved in the prologue
154    AND     tmp1, count, #0x000F0000    ;// partHeight-1
155    AND     tmp3, count, #0x00F00000    ;// partWidth-1
156    ADD     count, count, tmp1, LSL #8  ;// loop_y field [27:24] = partHeight-1
    ;// Per row: load the first 8 source bytes and split them into even/odd
    ;// byte pairs. In the comments below p0 is the byte in bits [7:0] of the
    ;// first word, p1 the next byte up, and so on (the numbering used by the
    ;// coefficient table in the "Horizontal filter approach" comment).
157loop_y
158    LDR     x_3_1, [ref, #-8]           ;// raw word p3..p0
159    ADD     count, count, tmp3, LSL #8  ;// loop_x field [31:28] = partWidth-1
160    LDR     x_7_5, [ref, #-4]           ;// raw word p7..p4
161    UXTB16  x_2_0, x_3_1                ;// x_2_0 = [p2 : p0]
162    UXTB16  x_3_1, x_3_1, ROR #8        ;// x_3_1 = [p3 : p1]
163    UXTB16  x_6_4, x_7_5                ;// x_6_4 = [p6 : p4]
164
    ;// First half: outputs y_0..y_3 accumulate in tmp1, tmp2, tmp3, tmp4
    ;// (tmp1 = y_0, tmp2 = y_1, tmp3 = y_2, tmp4 = y_3).
165loop_x
166    UXTB16  x_7_5, x_7_5, ROR #8        ;// x_7_5 = [p7 : p5] (raw word was left in x_7_5)
167
168    SMLAD   tmp1, x_2_0, mult_20_01, plus16     ;// y_0  = 16 + 20*p2 + p0
169    SMLATB  tmp3, x_2_0, mult_20_01, plus16     ;// y_2  = 16 + p2
170    SMLATB  tmp2, x_2_0, mult_20_m5, plus16     ;// y_1  = 16 - 5*p2
171    SMLATB  tmp4, x_3_1, mult_20_01, plus16     ;// y_3  = 16 + p3
172
173    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1       ;// y_0 += 20*p3 - 5*p1
174    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3       ;// y_2 += -5*p3
175    SMLAD   tmp2, x_3_1, mult_20_01, tmp2       ;// y_1 += 20*p3 + p1
176    LDR     x_3_1, [ref], #4                    ;// raw word p11..p8, ref += 4
177    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4       ;// y_3 += 20*p6 - 5*p4
178
179    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1       ;// y_0 += -5*p4
180    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3       ;// y_2 += 20*p4 - 5*p6
181    SMLADX  tmp2, x_6_4, mult_20_01, tmp2       ;// y_1 += 20*p4 + p6
182    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4       ;// y_3 += 20*p5 - 5*p7
183
184    SMLABB  tmp1, x_7_5, mult_20_01, tmp1       ;// y_0 += p5
185    UXTB16  x_2_0, x_3_1                        ;// x_2_0 = [p10 : p8]
186    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2       ;// y_1 += -5*p5
187    SMLADX  tmp3, x_7_5, mult_20_01, tmp3       ;// y_2 += 20*p5 + p7
188    SMLABB  tmp4, x_2_0, mult_20_01, tmp4       ;// y_3 += p8
189
    ;// Shift each sum right by 5, pair y_1/y_3 and y_0/y_2 as halfwords,
    ;// saturate each halfword to 0..255, then interleave into one word.
190    MOV     tmp2, tmp2, ASR #5
191    MOV     tmp1, tmp1, ASR #5
192    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)       ;// tmp2 = [y_3>>5 : y_1>>5]
193    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)       ;// tmp1 = [y_2>>5 : y_0>>5]
194    USAT16  tmp2, #8, tmp2
195    USAT16  tmp1, #8, tmp1
196
197    SUBS    count, count, #4<<28                ;// loop_x -= 4; borrow (C clear) when the row is done
198    ORR     tmp1, tmp1, tmp2, LSL #8            ;// bits [7:0]=y_0 [15:8]=y_1 [23:16]=y_2 [31:24]=y_3
199    STR     tmp1, [mb], #4                      ;// write 4 output bytes, mb += 4
200    BCC     next_y
201
    ;// Second half: the same computation four pixels further on. x_6_4/x_7_5
    ;// now take the role x_2_0/x_3_1 had above, and x_2_0/x_3_1 the role of
    ;// x_6_4/x_7_5, so no register moves are needed.
202    UXTB16  x_3_1, x_3_1, ROR #8                ;// x_3_1 = [p11 : p9]
203
204    SMLAD   tmp1, x_6_4, mult_20_01, plus16
205    SMLATB  tmp3, x_6_4, mult_20_01, plus16
206    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
207    SMLATB  tmp4, x_7_5, mult_20_01, plus16
208
209    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
210    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
211    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
212    LDR     x_7_5, [ref], #4                    ;// next raw word, ref += 4
213    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
214
215    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
216    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
217    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
218    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
219
220    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
221    UXTB16  x_6_4, x_7_5                        ;// even bytes of the new word
222    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
223    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
224    SMLABB  tmp4, x_6_4, mult_20_01, tmp4
225
226    MOV     tmp2, tmp2, ASR #5
227    MOV     tmp1, tmp1, ASR #5
228    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
229    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
230    USAT16  tmp2, #8, tmp2
231    USAT16  tmp1, #8, tmp1
232
233    SUBS    count, count, #4<<28                ;// loop_x -= 4; C stays set while pixels remain
234    ORR     tmp1, tmp1, tmp2, LSL #8
235    STR     tmp1, [mb], #4
236    BCS     loop_x
237
    ;// Row finished: step ref by (width - partWidth) and mb by
    ;// (16 - partWidth). ADDS clears carry, so each SBC subtracts one extra.
238next_y
239    AND     tmp3, count, #0x00F00000    ;// partWidth-1
240    SMLABB  ref, count, mult_20_01, ref ;// +width (low halfword of count times 1)
241    ADDS    mb, mb, #16                 ;// +16, Carry=0
242    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
243    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    ;// Adds 0x0F000000: decrements loop_y; when loop_y was non-zero the carry
    ;// out of that field wraps loop_x from 0xF to 0, otherwise bit 31 stays set.
244    ADDS    count, count, #(1<<28)-(1<<24)
245    BGE     loop_y                      ;// continue while the result is non-negative
246
247    ADD     sp,sp,#0x1f4                ;// drop locals (0x1e4) and saved r0-r3 (0x10)
248    LDMFD   sp!, {r4-r11, pc}           ;// restore r4-r11 and return
249
250    END
251
252