; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerHalf function
;--
;-------------------------------------------------------------------------------


    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    AREA    |.text|, CODE

;// h264bsdInterpolateVerHalf register allocation

ref     RN 0

mb      RN 1
buff    RN 1

count   RN 2
x0      RN 2

res     RN 3
y0      RN 3

tmp1    RN 4

tmp2    RN 5
height  RN 5

tmp3    RN 6
partW   RN 6

tmp4    RN 7
partH   RN 7

tmp5    RN 8
tmp6    RN 9

tmpa    RN 10
tmpb    RN 11
width   RN 12

plus16  RN 14
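
;// Note that several names above alias the same physical register (for
;// example mb/buff = r1, count/x0 = r2, res/y0 = r3). Each pair is used
;// in disjoint phases of the function, so the values never overlap.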


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateVerHalf

;// Approach to vertical interpolation
;//
;// Interpolation is done using 32-bit loads and stores and 16-bit
;// arithmetic. A 4x4 block is processed in each round.
;//
;// |a_11|a_11|a_11|a_11|...|a_1n|a_1n|a_1n|a_1n|
;// |b_11|b_11|b_11|b_11|...|b_1n|b_1n|b_1n|b_1n|
;// |c_11|c_11|c_11|c_11|...|c_1n|c_1n|c_1n|c_1n|
;// |d_11|d_11|d_11|d_11|...|d_1n|d_1n|d_1n|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m1|a_m1|a_m1|...
;// |b_m1|b_m1|b_m1|b_m1|...
;// |c_m1|c_m1|c_m1|c_m1|...
;// |d_m1|d_m1|d_m1|d_m1|...
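;//
;// Each output pixel is the standard H.264 6-tap half-pel value
;//
;//   out = clip8((A + T - 5*(C + R) + 20*(G + M) + 16) >> 5)
;//
;// where A,C,G,M,R,T are the six input pixels of the same column,
;// centred between G and M. The code below accumulates this as
;// 16+20(G+M)+(A+T)-5(C+R), saturates, and shifts right by 5.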

h264bsdInterpolateVerHalf
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp5, x0, partW         ;// (x0+partWidth)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp5, width
    BHI     do_fill                 ;// (x0+partW)>width

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp6, height
    BLS     skip_fill               ;// no overfill needed
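
    ;// In C terms the guard above reads: filling is needed whenever the
    ;// 6-tap source area reaches outside the frame, i.e. when
    ;//   x0 < 0 || x0 + partWidth > width ||
    ;//   y0 < 0 || y0 + partHeight + 5 > height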


do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// tmp5 = partHeight + 5
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+0xc = partHeight+5
    STR     partW, [sp,#0x10]       ;// sp+0x10 = partWidth
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1 (21x21 temporary buffer)
    BL      h264bsdFillBlock

    MOV     x0, #0
    STR     x0, [sp,#0x1ec]         ;// x0 = 0
    STR     x0, [sp,#0x1f0]         ;// y0 = 0
    ADD     ref, sp, #0x28          ;// ref = p1
    STR     partW, [sp,#0x218]      ;// width = partWidth for the filled block
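
    ;// Assuming the reference C prototype of h264bsdFillBlock, the call
    ;// above corresponds to
    ;//   h264bsdFillBlock(ref, p1, x0, y0, width, height,
    ;//                    partWidth, partHeight+5, partWidth);
    ;// after which the part is read from p1 with x0 = y0 = 0 and a scan
    ;// line of partWidth.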


skip_fill
    LDR     x0, [sp,#0x1ec]         ;// x0
    LDR     y0, [sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp6, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
    LDR     mb, [sp, #0x1e8]        ;// mb

    ADD     count, partW, partH, LSL #16    ;// |partH|partW|
    LDR     tmp5, = 0x00010001
    SSUB16  count, count, tmp5      ;// |partH-1|partW-1|
    LDR     plus16, = 0x00100010

    AND     tmp1, count, #0x000000FF ;// partWidth-1
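
    ;// count now packs both loop counters into one register:
    ;//   bits [7:0]   partWidth-1, the reload value for the column counter
    ;//   bits [23:16] partHeight-1, the row-group counter
    ;//   bits [31:24] column counter, reloaded at the top of loop_y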


loop_y
    ADD     count, count, tmp1, LSL #24  ;// partWidth-1 to top byte

loop_x
    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|
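
    ;// Six rows of the current 4-pixel column are now loaded. Each group
    ;// of four output pixels is computed as two 16-bit SIMD halves:
    ;// UXTB16 extracts bytes 0 and 2 of a word, UXTB16 with ROR #8
    ;// extracts bytes 1 and 3, and the halves are merged again with the
    ;// 0x00FF00FF and 0xFF00FF00 masks.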

    ;// first four pixels
    UXTB16  tmpa, tmp3                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp2                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)

    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
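
    ;// USAT16 clamped each halfword to [0,8191], so after LSR #5 every
    ;// lane is already in [0,255]; the AND therefore both selects the
    ;// even byte lanes and completes the clip to 8 bits.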

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp1, [ref], width          ;// load next bottom row (T)
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row (mb)

    ;// tmp2 = |a4|a3|a2|a1|
    ;// tmp3 = |c4|c3|c2|c1|
    ;// tmp4 = |g4|g3|g2|g1|
    ;// tmp5 = |m4|m3|m2|m1|
    ;// tmp6 = |r4|r3|r2|r1|
    ;// tmp1 = |t4|t3|t2|t1|
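
    ;// Only one fresh input row is loaded per output row; the other five
    ;// rows are reused, so the register roles rotate one step as listed
    ;// above.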

    ;// second four pixels
    UXTB16  tmpa, tmp4                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp3                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp2, [ref], width          ;// load next bottom row (T)
    LDR     tmpa, = 0xFF00FF00

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp3 = |a4|a3|a2|a1|
    ;// tmp4 = |c4|c3|c2|c1|
    ;// tmp5 = |g4|g3|g2|g1|
    ;// tmp6 = |m4|m3|m2|m1|
    ;// tmp1 = |r4|r3|r2|r1|
    ;// tmp2 = |t4|t3|t2|t1|

    ;// third four pixels
    UXTB16  tmpa, tmp5                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp4                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp3, [ref]                 ;// next bottom row (T), ref not advanced
    LDR     tmpa, = 0xFF00FF00

    ;// decrement loop_x counter
    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;

    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #16              ;// next row

    ;// tmp4 = |a4|a3|a2|a1|
    ;// tmp5 = |c4|c3|c2|c1|
    ;// tmp6 = |g4|g3|g2|g1|
    ;// tmp1 = |m4|m3|m2|m1|
    ;// tmp2 = |r4|r3|r2|r1|
    ;// tmp3 = |t4|t3|t2|t1|

    ;// fourth four pixels
    UXTB16  tmpa, tmp6                  ;// |g3|g1|
    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
    UXTB16  tmpb, tmp5                  ;// |c3|c1|
    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     res, = 0x00FF00FF
    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32

    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T

    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)

    USAT16  tmpb, #13, tmpa             ;// saturate
    LDR     tmp4, = 0xFF00FF00

    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width;
    ADD     ref, ref, #4                ;// next column (4 pixels)
    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
    ORR     res, res, tmpa
    STR     res, [mb], #-44             ;// 4th row done; mb back up 3 rows, +4 to next column

    BCS     loop_x
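
    ;// SUBS above clears the carry once the column counter in bits
    ;// [31:24] underflows, so the loop falls through after the last
    ;// 4-pixel column of this row group.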

    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
    AND     tmp1, count, #0x000000FF    ;// partWidth-1
    ADD     tmp2, tmp1, #1              ;// partWidth
    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64                 ;// mb += 4*16 (four output rows)
    SUB     mb, mb, tmp2                ;// mb -= partWidth
    BGE     loop_y
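
    ;// Each loop_y round emits four output rows: ref moves down four
    ;// input rows and back to the left edge of the part, and mb moves to
    ;// the start of the next four 16-byte rows. The ADDS above subtracts
    ;// 4 from the row-group counter; while rows remain, its carry into
    ;// the top byte also clears the underflowed column counter, and BGE
    ;// loops until the counter goes negative.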

    ADD     sp, sp, #0x1f4              ;// 0x1e4 locals + 0x10 to skip saved r0-r3
    LDMFD   sp!, {r4-r11, pc}

    END