; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
;--
;-------------------------------------------------------------------------------
20
21
22    IF :DEF: H264DEC_WINASM
23        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
24    ELSE
25        REQUIRE8
26        PRESERVE8
27    ENDIF
28
29    AREA    |.text|, CODE
30
31;// h264bsdInterpolateHorQuarter register allocation
32
33ref     RN 0
34
35mb      RN 1
36buff    RN 1
37
38count   RN 2
39x0      RN 2
40
41y0      RN 3
42x_2_0   RN 3
43
44width   RN 4
45x_3_1   RN 4
46
47height  RN 5
48x_6_4   RN 5
49
50partW   RN 6
51x_7_5   RN 6
52
53partH   RN 7
54tmp1    RN 7
55
56tmp2    RN 8
57
58tmp3    RN 9
59
60tmp4    RN 10
61
62mult_20_01  RN 11
63
64mult_20_m5  RN 12
65
66plus16  RN 14
67
68
69;// function exports and imports
70
71    IMPORT  h264bsdFillBlock
72
73    EXPORT  h264bsdInterpolateHorQuarter
74
75
;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust the coefficients
;// like below. Calculation is done with 16-bit maths.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1
87
88
89h264bsdInterpolateHorQuarter
90    STMFD   sp!, {r0-r11, lr}
91    SUB     sp, sp, #0x1e4
92
93    CMP     x0, #0
94    BLT     do_fill                 ;// (x0 < 0)
95    LDR     partW, [sp,#0x220]      ;// partWidth
96    ADD     tmp4, x0, partW         ;// (x0+partWidth)
97    ADD     tmp4, tmp4, #5          ;// (y0+partW+5)
98    LDR     width, [sp,#0x218]      ;// width
99    CMP     tmp4, width
100    BHI     do_fill                 ;// (x0+partW)>width
101
102    CMP     y0, #0
103    BLT     do_fill                 ;// (y0 < 0)
104    LDR     partH, [sp,#0x224]      ;// partHeight
105    ADD     tmp2, y0, partH         ;// (y0+partHeight)
106    LDR     height, [sp,#0x21c]     ;// height
107    CMP     tmp2, height
108    BLS     skip_fill               ;// no overfill needed
109
110
111do_fill
112    LDR     partH, [sp,#0x224]      ;// partHeight
113    LDR     height, [sp,#0x21c]     ;// height
114    LDR     partW, [sp,#0x220]      ;// partWidth
115    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
116    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
117    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
118    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
119    LDR     width, [sp,#0x218]      ;// width
120    STR     width, [sp,#0]          ;// sp+0 = width
121    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1]
122    BL      h264bsdFillBlock
123
124    MOV     x0, #0
125    STR     x0,[sp,#0x1ec]          ;// x0 = 0
126    STR     x0,[sp,#0x1f0]          ;// y0 = 0
127    ADD     ref,sp,#0x28            ;// ref = p1
128    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5
129
130
131skip_fill
132    LDR     x0 ,[sp,#0x1ec]         ;// x0
133    LDR     y0 ,[sp,#0x1f0]         ;// y0
134    LDR     width, [sp,#0x218]      ;// width
135    MLA     tmp2, width, y0, x0     ;// y0*width+x0
136    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
137    ADD     ref, ref, #8            ;// ref = ref+8
138    LDR     mb, [sp, #0x1e8]        ;// mb
139
140    ;// pack values to count register
141    ;// [31:28] loop_x (partWidth-1)
142    ;// [27:24] loop_y (partHeight-1)
143    ;// [23:20] partWidth-1
144    ;// [19:16] partHeight-1
145    ;// [15:00] width
146    MOV     count, width
147    SUB     partW, partW, #1;
148    SUB     partH, partH, #1;
149    ADD     tmp2, partH, partW, LSL #4
150    ADD     count, count, tmp2, LSL #16
151
152
153    LDR     mult_20_01, = 0x00140001
154    LDR     mult_20_m5, = 0x0014FFFB
155    MOV     plus16, #16
156    AND     tmp1, count, #0x000F0000    ;// partHeight-1
157    AND     tmp3, count, #0x00F00000    ;// partWidth-1
158    ADD     count, count, tmp1, LSL #8
159loop_y
160    LDR     x_3_1, [ref, #-8]
161    ADD     count, count, tmp3, LSL #8
162    LDR     x_7_5, [ref, #-4]
163    UXTB16  x_2_0, x_3_1
164    UXTB16  x_3_1, x_3_1, ROR #8
165    UXTB16  x_6_4, x_7_5
166
167loop_x
168    UXTB16  x_7_5, x_7_5, ROR #8
169
170    SMLAD   tmp1, x_2_0, mult_20_01, plus16
171    SMLATB  tmp3, x_2_0, mult_20_01, plus16
172    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
173    SMLATB  tmp4, x_3_1, mult_20_01, plus16
174
175    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
176    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
177    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
178    LDR     x_3_1, [ref], #4
179    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4
180
181    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
182    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
183    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
184    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4
185
186    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
187    UXTB16  x_2_0, x_3_1
188    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
189    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
190    SMLABB  tmp4, x_2_0, mult_20_01, tmp4
191
192    MOV     tmp2, tmp2, ASR #5
193    MOV     tmp1, tmp1, ASR #5
194    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
195    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
196    LDR     tmp4, [sp, #0x228]
197    USAT16  tmp2, #8, tmp2
198    USAT16  tmp1, #8, tmp1
199    SUB     tmp4, tmp4, #10
200
201    SUBS    count, count, #4<<28
202    LDR     tmp3, [ref, tmp4]
203    ORR     tmp1, tmp1, tmp2, LSL #8
204
205;// quarter pel position
206    LDR     tmp2, = 0x80808080
207    MVN     tmp3, tmp3
208    UHSUB8  tmp1, tmp1, tmp3
209    EOR     tmp1, tmp1, tmp2
210    STR     tmp1, [mb], #4
211
212    BCC     next_y
213
214    UXTB16  x_3_1, x_3_1, ROR #8
215
216    SMLAD   tmp1, x_6_4, mult_20_01, plus16
217    SMLATB  tmp3, x_6_4, mult_20_01, plus16
218    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
219    SMLATB  tmp4, x_7_5, mult_20_01, plus16
220
221    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
222    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
223    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
224    LDR     x_7_5, [ref], #4
225    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4
226
227    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
228    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
229    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
230    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4
231
232    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
233    UXTB16  x_6_4, x_7_5
234    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
235    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
236    SMLABB  tmp4, x_6_4, mult_20_01, tmp4
237
238    MOV     tmp2, tmp2, ASR #5
239    MOV     tmp1, tmp1, ASR #5
240    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
241    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
242    LDR     tmp4, [sp, #0x228]
243    USAT16  tmp2, #8, tmp2
244    USAT16  tmp1, #8, tmp1
245    SUB     tmp4, tmp4, #10
246
247    SUBS    count, count, #4<<28
248    LDR     tmp3, [ref, tmp4]
249    ORR     tmp1, tmp1, tmp2, LSL #8
250
251;// quarter pel
252    LDR     tmp2, = 0x80808080
253    MVN     tmp3, tmp3
254    UHSUB8  tmp1, tmp1, tmp3
255    EOR     tmp1, tmp1, tmp2
256
257    STR     tmp1, [mb], #4
258    BCS     loop_x
259
260next_y
261    AND     tmp3, count, #0x00F00000    ;// partWidth-1
262    SMLABB  ref, count, mult_20_01, ref ;// +width
263    ADDS    mb, mb, #16                 ;// +16, Carry=0
264    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
265    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
266    ADDS    count, count, #(1<<28)-(1<<24)
267    BGE     loop_y
268
269    ADD     sp,sp,#0x1f4
270    LDMFD   sp!, {r4-r11, pc}
271
272    END
273
274