1; Copyright (C) 2009 The Android Open Source Project
2;
3; Licensed under the Apache License, Version 2.0 (the "License");
4; you may not use this file except in compliance with the License.
5; You may obtain a copy of the License at
6;
7;      http://www.apache.org/licenses/LICENSE-2.0
8;
9; Unless required by applicable law or agreed to in writing, software
10; distributed under the License is distributed on an "AS IS" BASIS,
11; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12; See the License for the specific language governing permissions and
13; limitations under the License.
14
15;-------------------------------------------------------------------------------
16;--
17;-- Abstract : ARMv6 optimized version of h264bsdInterpolateVerQuarter function
18;--
19;-------------------------------------------------------------------------------
20
21    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
23    ELSE
24        REQUIRE8
25        PRESERVE8
26    ENDIF
27
28    AREA    |.text|, CODE
29
30;// h264bsdInterpolateVerQuarter register allocation
31
32ref     RN 0
33
34mb      RN 1
35buff    RN 1
36
37count   RN 2
38x0      RN 2
39
40res     RN 3
41y0      RN 3
42
43tmp1    RN 4
44
45tmp2    RN 5
46height  RN 5
47
48tmp3    RN 6
49partW   RN 6
50
51tmp4    RN 7
52partH   RN 7
53
54tmp5    RN 8
55tmp6    RN 9
56
57tmpa    RN 10
58tmpb    RN 11
59width   RN 12
60
61plus16  RN 14
62
63
64;// function exports and imports
65
66    IMPORT  h264bsdFillBlock
67
68    EXPORT  h264bsdInterpolateVerQuarter
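
;// Assumed C prototype of this function (based on the matching reference
;// C implementation; shown here for readability only):
;//
;//   void h264bsdInterpolateVerQuarter(u8 *ref, u8 *mb, i32 x0, i32 y0,
;//                                     u32 width, u32 height,
;//                                     u32 partWidth, u32 partHeight,
;//                                     u32 verOffset);
;//
;// r0-r3 carry ref, mb, x0 and y0; the remaining five arguments arrive
;// on the stack per the AAPCS.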
69
70;// Approach to vertical interpolation
71;//
;// Interpolation is done using 32-bit loads and stores
;// and 16-bit arithmetic. A 4x4 block is processed
;// in each round.
75;//
;// |a_11|a_12|a_13|a_14|...|a_1(n-3)|a_1(n-2)|a_1(n-1)|a_1n|
;// |b_11|b_12|b_13|b_14|...|b_1(n-3)|b_1(n-2)|b_1(n-1)|b_1n|
;// |c_11|c_12|c_13|c_14|...|c_1(n-3)|c_1(n-2)|c_1(n-1)|c_1n|
;// |d_11|d_12|d_13|d_14|...|d_1(n-3)|d_1(n-2)|d_1(n-1)|d_1n|
;//           ..
;//           ..
;// |a_m1|a_m2|a_m3|a_m4|...
;// |b_m1|b_m2|b_m3|b_m4|...
;// |c_m1|c_m2|c_m3|c_m4|...
;// |d_m1|d_m2|d_m3|d_m4|...
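;//
;// For reference, a scalar sketch of the per-pixel arithmetic done by
;// the packed code below (an illustration, not part of the build;
;// clip255() is shorthand for clamping to [0,255]). A,C,G,M,R,T are
;// six vertically adjacent source pixels around the output position:
;//
;//     int tmp  = (A + T) - 5*(C + R) + 20*(G + M);  /* 6-tap filter  */
;//     int half = clip255((tmp + 16) >> 5);          /* round + clamp */
;//     int full = verOffset ? M : G;                 /* nearest pel   */
;//     int out  = (half + full + 1) >> 1;            /* quarter-pel   */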
86
87h264bsdInterpolateVerQuarter
88    STMFD   sp!, {r0-r11, lr}
89    SUB     sp, sp, #0x1e4
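
    ;// Stack frame after the two instructions above (offsets from sp);
    ;// this is implied by the loads and stores throughout the routine:
    ;//   0x000..0x027  outgoing argument area for h264bsdFillBlock
    ;//   0x028..0x1e3  on-stack fill buffer (picture-boundary case)
    ;//   0x1e4..0x1f3  saved r0-r3 (ref, mb, x0, y0)
    ;//   0x1f4..0x217  saved r4-r11, lr
    ;//   0x218..       incoming stack args: width, height, partWidth,
    ;//                 partHeight, verOffset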
90
91    CMP     x0, #0
92    BLT     do_fill                 ;// (x0 < 0)
93    LDR     partW, [sp,#0x220]      ;// partWidth
94    ADD     tmp5, x0, partW         ;// (x0+partWidth)
95    LDR     width, [sp,#0x218]      ;// width
96    CMP     tmp5, width
97    BHI     do_fill                 ;// (x0+partW)>width
98
99    CMP     y0, #0
100    BLT     do_fill                 ;// (y0 < 0)
101    LDR     partH, [sp,#0x224]      ;// partHeight
102    ADD     tmp6, y0, partH         ;// (y0+partHeight)
    ADD     tmp6, tmp6, #5          ;// (y0+partH+5), filter reads 5 extra rows
104    LDR     height, [sp,#0x21c]     ;// height
105    CMP     tmp6, height
106    BLS     skip_fill               ;// no overfill needed
107
108
109do_fill
110    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp5, partH, #5         ;// tmp5 = partHeight + 5
112    LDR     height, [sp,#0x21c]     ;// height
113    LDR     partW, [sp,#0x220]      ;// partWidth
114    STMIB   sp, {height, partW}     ;// sp+4 = height, sp+8 = partWidth
    STR     tmp5, [sp,#0xc]         ;// sp+0xc = partHeight+5
116    STR     partW, [sp,#0x10]       ;// sp+10 = partWidth
117    LDR     width, [sp,#0x218]      ;// width
118    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1, on-stack fill buffer
120    BL      h264bsdFillBlock
121
122    MOV     x0, #0
123    STR     x0,[sp,#0x1ec]          ;// x0 = 0
124    STR     x0,[sp,#0x1f0]          ;// y0 = 0
125    ADD     ref,sp,#0x28            ;// ref = p1
126    STR     partW, [sp,#0x218]
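
    ;// From here on the block is read from the on-stack buffer: ref
    ;// points to the filled data, x0 = y0 = 0, and the width slot now
    ;// holds partWidth (the buffer's scanline length), so the address
    ;// computation in skip_fill works unchanged.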
127
128
129skip_fill
130    LDR     x0 ,[sp,#0x1ec]         ;// x0
131    LDR     y0 ,[sp,#0x1f0]         ;// y0
132    LDR     width, [sp,#0x218]      ;// width
133    MLA     tmp6, width, y0, x0     ;// y0*width+x0
134    ADD     ref, ref, tmp6          ;// ref += y0*width+x0
135    LDR     mb, [sp, #0x1e8]        ;// mb
136
137    ADD     count, partW, partH, LSL #8    ;// |xx|xx|partH|partW|
138    LDR     tmp5, = 0x00010100
139    RSB     count, tmp5, count, LSL #8      ;// |xx|partH-1|partW-1|xx|
140    LDR     tmp2, [sp, #0x228]      ;// verOffset
141    ADD     count, count, tmp2      ;// |xx|partH-1|partW-1|verOffset|
142    LDR     plus16, = 0x00100010
143
    AND     tmp1, count, #0x0000FF00 ;// partWidth-1
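
    ;// count now packs all loop state into one register:
    ;//   bits 24-31: x counter (partWidth-1, reloaded at top of loop_y)
    ;//   bits 16-23: y counter (partHeight-1)
    ;//   bits  8-15: partWidth-1, kept as the reload value
    ;//   bit      0: verOffset (selects the G or M row for the average)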
145
146
147loop_y
148    ADD     count, count, tmp1, LSL #16  ;// partWidth-1 to top byte
149
150loop_x
151    LDR     tmp1, [ref], width     ;// |a4|a3|a2|a1|
152    LDR     tmp2, [ref], width     ;// |c4|c3|c2|c1|
153    LDR     tmp3, [ref], width     ;// |g4|g3|g2|g1|
154    LDR     tmp4, [ref], width     ;// |m4|m3|m2|m1|
155    LDR     tmp5, [ref], width     ;// |r4|r3|r2|r1|
156    LDR     tmp6, [ref], width     ;// |t4|t3|t2|t1|
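
    ;// The six rows above seed a software pipeline: each of the four
    ;// 4-pixel groups below reuses the latest six rows and loads just
    ;// one new row ("load next row"), so the nine source rows of a
    ;// 4x4 block (4 output rows + 5 filter taps) are each loaded once.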
157
158    ;// first four pixels
159    UXTB16  tmpa, tmp3                  ;// |g3|g1|
160    UXTAB16 tmpa, tmpa, tmp4            ;// |g3+m3|g1+m1|
161    UXTB16  tmpb, tmp2                  ;// |c3|c1|
162    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
163
164    UXTAB16 tmpb, tmpb, tmp5            ;// |c3+r3|c1+r1|
165    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
166    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A
167    UXTAB16 tmpa, tmpa, tmp6            ;// 16+20(G+M)+A+T
168
169    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
170    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
171
172    USAT16  tmpb, #13, tmpa             ;// saturate
173    LDR     res, = 0x00FF00FF
174    UXTB16  tmpa, tmp3, ROR #8          ;// |g4|g2|
175    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// |g4+m4|g2+m2|
176    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
177
178    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
179    UXTB16  tmpb, tmp2, ROR #8          ;// |c4|c2|
180    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
181    UXTAB16 tmpb, tmpb, tmp5, ROR #8    ;// |c4+r4|c2+r2|
182    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A
183    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// 16+20(G+M)+A+T
184
185    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
186    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
187
188    USAT16  tmpb, #13, tmpa             ;// saturate
    MOVS    tmp1, count, LSL #31        ;// Z flag = (verOffset == 0)
190    LDR     tmpa, = 0xFF00FF00
191    MVNEQ   tmp1, tmp3                  ;// select verOffset=0
192    MVNNE   tmp1, tmp4                  ;// select verOffset=1
    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
194    ORR     res, res, tmpa
195
196    LDR     tmpa, = 0x80808080
197    UHSUB8  res, res, tmp1              ;// bilinear interpolation
198    LDR     tmp1, [ref], width          ;// load next row
199    EOR     res, res, tmpa              ;// correct sign
200
201    STR     res, [mb], #16              ;// next row (mb)
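
    ;// Note on the bilinear step above: UHSUB8 computes the per-byte
    ;// halving difference (res - tmp1) >> 1 without overflow. With
    ;// tmp1 = ~P = 255 - P (P = full-pel row selected by verOffset)
    ;// this gives (res + P - 255) >> 1 = ((res + P + 1) >> 1) - 128,
    ;// and the EOR with 0x80808080 adds 128 back per byte, yielding
    ;// the rounded average (res + P + 1) >> 1.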
202
203
204    ;// tmp2 = |a4|a3|a2|a1|
205    ;// tmp3 = |c4|c3|c2|c1|
206    ;// tmp4 = |g4|g3|g2|g1|
207    ;// tmp5 = |m4|m3|m2|m1|
208    ;// tmp6 = |r4|r3|r2|r1|
209    ;// tmp1 = |t4|t3|t2|t1|
210
211    ;// second four pixels
212    UXTB16  tmpa, tmp4                  ;// |g3|g1|
213    UXTAB16 tmpa, tmpa, tmp5            ;// |g3+m3|g1+m1|
214    UXTB16  tmpb, tmp3                  ;// |c3|c1|
215    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
216    UXTAB16 tmpb, tmpb, tmp6            ;// |c3+r3|c1+r1|
217    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
218    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A
219    UXTAB16 tmpa, tmpa, tmp1            ;// 16+20(G+M)+A+T
220
221    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
222    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
223
224    USAT16  tmpb, #13, tmpa             ;// saturate
225    LDR     res, = 0x00FF00FF
226    UXTB16  tmpa, tmp4, ROR #8          ;// |g4|g2|
227    UXTAB16 tmpa, tmpa, tmp5, ROR #8    ;// |g4+m4|g2+m2|
228    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
229
230    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
231    UXTB16  tmpb, tmp3, ROR #8          ;// |c4|c2|
232    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
233    UXTAB16 tmpb, tmpb, tmp6, ROR #8    ;// |c4+r4|c2+r2|
234    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A
235    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// 16+20(G+M)+A+T
236
237    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
238    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
239
240    USAT16  tmpb, #13, tmpa             ;// saturate
241    LDR     tmpa, = 0xFF00FF00
242    MVNEQ   tmp2, tmp4                  ;// select verOffset=0
243    MVNNE   tmp2, tmp5                  ;// select verOffset=1
244
245    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
246    ORR     res, res, tmpa
247    LDR     tmpa, = 0x80808080
248    UHSUB8  res, res, tmp2              ;// bilinear interpolation
249    LDR     tmp2, [ref], width          ;// load next row
250    EOR     res, res, tmpa              ;// correct sign
251    STR     res, [mb], #16              ;// next row
252
253    ;// tmp3 = |a4|a3|a2|a1|
254    ;// tmp4 = |c4|c3|c2|c1|
255    ;// tmp5 = |g4|g3|g2|g1|
256    ;// tmp6 = |m4|m3|m2|m1|
257    ;// tmp1 = |r4|r3|r2|r1|
258    ;// tmp2 = |t4|t3|t2|t1|
259
260    ;// third four pixels
261    UXTB16  tmpa, tmp5                  ;// |g3|g1|
262    UXTAB16 tmpa, tmpa, tmp6            ;// |g3+m3|g1+m1|
263    UXTB16  tmpb, tmp4                  ;// |c3|c1|
264    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
265    UXTAB16 tmpb, tmpb, tmp1            ;// |c3+r3|c1+r1|
266    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
267    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A
268    UXTAB16 tmpa, tmpa, tmp2            ;// 16+20(G+M)+A+T
269
270    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
271    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
272
273    USAT16  tmpb, #13, tmpa             ;// saturate
274    LDR     res, = 0x00FF00FF
275    UXTB16  tmpa, tmp5, ROR #8          ;// |g4|g2|
276    UXTAB16 tmpa, tmpa, tmp6, ROR #8    ;// |g4+m4|g2+m2|
277    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
278
279    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
280    UXTB16  tmpb, tmp4, ROR #8          ;// |c4|c2|
281    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
282    UXTAB16 tmpb, tmpb, tmp1, ROR #8    ;// |c4+r4|c2+r2|
283    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A
284    UXTAB16 tmpa, tmpa, tmp2, ROR #8    ;// 16+20(G+M)+A+T
285
286
287    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
288    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
289
290    USAT16  tmpb, #13, tmpa             ;// saturate
291    LDR     tmpa, = 0xFF00FF00
292    MVNEQ   tmp3, tmp5                  ;// select verOffset=0
293    MVNNE   tmp3, tmp6                  ;// select verOffset=1
294
295    AND     tmpa, tmpa, tmpb, LSL #3    ;// mask and divide by 32
296    ORR     res, res, tmpa
297    LDR     tmpa, = 0x80808080
298    UHSUB8  res, res, tmp3              ;// bilinear interpolation
299    LDR     tmp3, [ref]                 ;// load next row
300    EOR     res, res, tmpa              ;// correct sign
301    STR     res, [mb], #16              ;// next row
302
303    ;// tmp4 = |a4|a3|a2|a1|
304    ;// tmp5 = |c4|c3|c2|c1|
305    ;// tmp6 = |g4|g3|g2|g1|
306    ;// tmp1 = |m4|m3|m2|m1|
307    ;// tmp2 = |r4|r3|r2|r1|
308    ;// tmp3 = |t4|t3|t2|t1|
309
310    ;// fourth four pixels
311    UXTB16  tmpa, tmp6                  ;// |g3|g1|
312    UXTAB16 tmpa, tmpa, tmp1            ;// |g3+m3|g1+m1|
313    UXTB16  tmpb, tmp5                  ;// |c3|c1|
314    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
315    UXTAB16 tmpb, tmpb, tmp2            ;// |c3+r3|c1+r1|
316    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
317    UXTAB16 tmpa, tmpa, tmp4            ;// 16+20(G+M)+A
318    UXTAB16 tmpa, tmpa, tmp3            ;// 16+20(G+M)+A+T
319
320    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
321    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
322
323    USAT16  tmpb, #13, tmpa             ;// saturate
324    LDR     res, = 0x00FF00FF
325    UXTB16  tmpa, tmp6, ROR #8          ;// |g4|g2|
326    UXTAB16 tmpa, tmpa, tmp1, ROR #8    ;// |g4+m4|g2+m2|
327    AND     res, res, tmpb, LSR #5      ;// mask and divide by 32
328
329    ADD     tmpa, tmpa, tmpa, LSL #2    ;// 5(G+M)
330    UXTB16  tmpb, tmp5, ROR #8          ;// |c4|c2|
331    ADD     tmpa, plus16, tmpa, LSL #2  ;// 16+20(G+M)
332    UXTAB16 tmpb, tmpb, tmp2, ROR #8    ;// |c4+r4|c2+r2|
333    UXTAB16 tmpa, tmpa, tmp4, ROR #8    ;// 16+20(G+M)+A
334    UXTAB16 tmpa, tmpa, tmp3, ROR #8    ;// 16+20(G+M)+A+T
335
336    ADD     tmpb, tmpb, tmpb, LSL #2    ;// 5(C+R)
337    SSUB16  tmpa, tmpa, tmpb            ;// 16+20(G+M)+(A+T)-5(C+R)
338
339    USAT16  tmpb, #13, tmpa             ;// saturate
340    LDR     tmp4, = 0xFF00FF00
341    MVNEQ   tmp5, tmp6                  ;// select verOffset=0
342    MVNNE   tmp5, tmp1                  ;// select verOffset=1
343
344    AND     tmpa, tmp4, tmpb, LSL #3    ;// mask and divide by 32
345    ORR     res, res, tmpa
346    LDR     tmpa, = 0x80808080
347    UHSUB8  res, res, tmp5              ;// bilinear interpolation
348
349    ;// decrement loop_x counter
350    SUBS    count, count, #4<<24        ;// (partWidth-1) -= 4;
351
352    ;// calculate "ref" address for next round
    SUB     ref, ref, width, LSL #3     ;// ref -= 8*width (rewind row loads)
    ADD     ref, ref, #4                ;// next column (4 pixels)
355
356    EOR     res, res, tmpa              ;// correct sign
    STR     res, [mb], #-44             ;// last row; mb -= 44 -> next 4x4 block
358
359    BCS     loop_x
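
    ;// End of loop_x: step to the next row of 4x4 blocks. ref moves
    ;// down 4 rows and back partWidth columns; mb advances 4 rows
    ;// (4*16 bytes) minus the partWidth already covered by the
    ;// column stepping above.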
360
361    ADDS    count, count, #252<<16      ;// (partHeight-1) -= 4;
362    ADD     ref, ref, width, LSL #2     ;// ref += 4*width
363    AND     tmp1, count, #0x0000FF00    ;// partWidth-1
364    MOV     tmp2, #1
365    ADD     tmp2, tmp2, tmp1, LSR #8    ;// partWidth
366    SUB     ref, ref, tmp2              ;// ref -= partWidth
    ADD     mb, mb, #64                 ;// mb += 4 rows (4*16 bytes)
    SUB     mb, mb, tmp2                ;// mb -= partWidth
369    BGE     loop_y
370
    ADD     sp, sp, #0x1f4
372    LDMFD   sp!, {r4-r11, pc}
373
374    END
375