h264bsd_interpolate_chroma_hor_ver.s revision 0c1bc742181ded4930842b46e9507372f0b1b963
1; Copyright (C) 2009 The Android Open Source Project
2;
3; Licensed under the Apache License, Version 2.0 (the "License");
4; you may not use this file except in compliance with the License.
5; You may obtain a copy of the License at
6;
7;      http://www.apache.org/licenses/LICENSE-2.0
8;
9; Unless required by applicable law or agreed to in writing, software
10; distributed under the License is distributed on an "AS IS" BASIS,
11; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12; See the License for the specific language governing permissions and
13; limitations under the License.
14
15;-------------------------------------------------------------------------------
16;--
17;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHorVer
18;--            function
19;--
20;-------------------------------------------------------------------------------
21
22
;// Build configuration: unless H264DEC_WINASM is defined, mark this object
;// with the armasm REQUIRE8/PRESERVE8 stack-alignment attributes.
23    IF  :DEF: H264DEC_WINASM
24        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
25    ELSE
26        REQUIRE8
27        PRESERVE8
28    ENDIF
29
;// Everything that follows is placed in the code section named .text.
30    AREA    |.text|, CODE
31
32
33;// h264bsdInterpolateChromaHorVer register allocation
;//
;// Several names below are aliases for the SAME physical register. Each pair
;// is used in non-overlapping phases of the function; the note on the second
;// name of each pair says where the first value stops being needed.
34
35ref     RN 0     ;// r0: reference picture pointer (1st argument)
36ptrA    RN 0     ;// r0: running source pointer; replaces ref once the start address is computed
37
38mb      RN 1     ;// r1: output pointer (predPartChroma, 2nd argument)
39block   RN 1     ;// r1: scratch-buffer pointer passed to h264bsdFillBlock
40
41x0      RN 2     ;// r2: x0 (3rd argument)
42count   RN 2     ;// r2: packed loop counters; built on top of x0 with ADD, so bits [15:0] keep x0
43
44y0      RN 3     ;// r3: y0 (4th argument)
45valY    RN 3     ;// r3: packed |yFrac|8-yFrac| halfwords; written after y0 has been consumed
46
47width   RN 4     ;// r4: row stride of the reference data in bytes
48
49tmp4    RN 5     ;// r5: loop temporary
50height  RN 5     ;// r5: reference height; reloaded from the stack before the second plane
51
52tmp1    RN 6     ;// r6: loop temporary
53
54tmp2    RN 7     ;// r7: loop temporary
55
56tmp3    RN 8     ;// r8: loop temporary
57
58valX    RN 9     ;// r9: 8 - xFrac
59
60tmp5    RN 10    ;// r10: loop temporary
61chrPW   RN 10    ;// r10: chromaPartWidth; consumed when count is packed, before the loops
62
63tmp6    RN 11    ;// r11: loop temporary
64chrPH   RN 11    ;// r11: chromaPartHeight; consumed when count is packed, before the loops
65
66xFrac   RN 12    ;// r12: horizontal fraction
67
68c32     RN 14    ;// r14 (lr, saved in the prologue): rounding constant 32
69yFrac   RN 14    ;// r14: vertical fraction; packed into valY before c32 overwrites it
70
71;// function exports and imports
72
73    IMPORT  h264bsdFillBlock
74
75    EXPORT  h264bsdInterpolateChromaHorVer
76
77;//  Function arguments
;//
;//  Bilinear (horizontal + vertical) interpolation of two chroma planes.
;//  The offset after each argument is where the code below addresses it
;//  relative to sp once the prologue has run: the prologue pushes 13
;//  registers (0x34 bytes) and reserves 0xc4 bytes, so the saved r0-r3 sit
;//  at sp+0xc4..0xd0 and the caller's stack arguments start at sp+0xf8.
78;//
79;//  u8 *ref,                   : 0xc4
80;//  u8 *predPartChroma,        : 0xc8
81;//  i32 x0,                    : 0xcc
82;//  i32 y0,                    : 0xd0
83;//  u32 width,                 : 0xf8
84;//  u32 height,                : 0xfc
85;//  u32 xFrac,                 : 0x100
86;//  u32 yFrac,                 : 0x104
87;//  u32 chromaPartWidth,       : 0x108
88;//  u32 chromaPartHeight       : 0x10c
;//
;//  Local stack frame as used below:
;//    sp+0x00..0x13 : five outgoing stack arguments for h264bsdFillBlock
;//    sp+0x1c..0xc3 : scratch "block" buffer handed to h264bsdFillBlock and
;//                    then used as the reference when a fill was needed
;//    sp+0xc4..0xd3 : saved r0-r3 (ref, predPartChroma, x0, y0)
;//    sp+0xd4..0xf7 : saved r4-r11, lr
;//
;//  Output: the first plane is written starting at predPartChroma and the
;//  second at predPartChroma+64, both with a row stride of 8 bytes.
;//  Each inner-loop pass produces 2 columns x 2 rows.
;//  NOTE(review): the loop counters assume chromaPartWidth and
;//  chromaPartHeight are even and at most 16 -- confirm against callers.
89
90h264bsdInterpolateChromaHorVer
91    STMFD   sp!, {r0-r11,lr}        ;// save args r0-r3, callee-saved r4-r11 and lr
92    SUB     sp, sp, #0xc4           ;// reserve locals (fill args + block buffer)
93
94    LDR     chrPW, [sp, #0x108]     ;// chromaPartWidth
95    LDR     xFrac, [sp, #0x100]     ;// xFrac (loaded again at skip_fill)
96    LDR     width, [sp, #0xf8]      ;// width
;// Decide whether the (chromaPartWidth+1) x (chromaPartHeight+1) source
;// window lies fully inside the reference; if not, go through do_fill.
97    CMP     x0, #0
98    BLT     do_fill                 ;// x0 < 0 (signed)
99
100    ADD     tmp1, x0, chrPW         ;// tmp1 = x0+ chromaPartWidth
101    ADD     tmp1, tmp1, #1          ;// tmp1 = x0+ chromaPartWidth+1
102    CMP     tmp1, width             ;// x0+chromaPartWidth+1 > width
103    BHI     do_fill
104
105    CMP     y0, #0
106    BLT     do_fill                 ;// y0 < 0 (signed)
107    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
108    LDR     height, [sp, #0xfc]     ;// height
109    ADD     tmp1, y0, chrPH         ;// tmp1 = y0 + chromaPartHeight
110    ADD     tmp1, tmp1, #1          ;// tmp1 = y0 + chromaPartHeight + 1
111    CMP     tmp1, height
112    BLS     skip_fill               ;// window fits: read directly from ref
113
;// Window crosses a picture edge: call h264bsdFillBlock once per plane to
;// build both planes back to back in the stack buffer, then redirect
;// ref/x0/y0/width/height to that buffer.
;// NOTE(review): the register/stack argument order presumably matches the C
;// prototype (ref, fill, x0, y0, width, height, blockWidth, blockHeight,
;// fillScanLength) -- confirm against the h264bsdFillBlock declaration.
114do_fill
115    LDR     chrPH, [sp, #0x10c]     ;// chromaPartHeight
116    LDR     height, [sp, #0xfc]     ;// height
117    ADD     tmp3, chrPW, #1         ;// tmp3 = chromaPartWidth+1
118    ADD     tmp1, chrPW, #1         ;// tmp1 = chromaPartWidth+1
119    ADD     tmp2, chrPH, #1         ;// tmp2 = chromaPartHeight+1
120    STMIA   sp,{width,height,tmp1,tmp2,tmp3} ;// outgoing stack args at sp+0..0x13
121    ADD     block, sp, #0x1c        ;// block
122    BL      h264bsdFillBlock        ;// first plane; r0=ref, r2=x0, r3=y0 still hold the incoming args
123
;// Second plane: restore r0/r2/r3 from their saved copies and store the
;// stack arguments again before the second call.
124    LDR     x0, [sp, #0xcc]
125    LDR     y0, [sp, #0xd0]
126    LDR     ref, [sp, #0xc4]        ;// ref
127    STMIA   sp,{width,height,tmp1,tmp2,tmp3}
128    ADD     block, sp, #0x1c        ;// block
129    MLA     ref, height, width, ref ;// ref += width * height;
130    MLA     block, tmp2, tmp1, block;// block + (chromaPW+1)*(chromaPH+1)
131    BL      h264bsdFillBlock
132
;// From here on the filled buffer is the reference: origin (0,0), stride
;// chromaPartWidth+1, plane height chromaPartHeight+1. The saved copies are
;// updated too because the second-plane setup reloads them from the stack.
133    MOV     x0, #0                  ;// x0 = 0
134    MOV     y0, #0                  ;// y0 = 0
135    STR     x0, [sp, #0xcc]
136    STR     y0, [sp, #0xd0]
137    ADD     ref, sp, #0x1c          ;// ref = block
138    STR     ref, [sp, #0xc4]        ;// ref
139
140    STR     tmp2, [sp, #0xfc]       ;// height
141    STR     tmp1, [sp, #0xf8]       ;// width
142    MOV     width, tmp1
143
144skip_fill
145    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
146    LDR     yFrac, [sp, #0x104]     ;// yFrac
147    LDR     xFrac, [sp, #0x100]     ;// xFrac (r12 may have been changed by the calls above)
148    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
149    RSB     valX, xFrac, #8         ;// valX = 8-xFrac
150    RSB     valY, yFrac, #8         ;// valY = 8-yFrac
151
152    LDR     mb, [sp, #0xc8]         ;// predPartChroma
153
154
155    ;// pack values to count register
156    ;// [31:28] loop_x (chromaPartWidth-1)
157    ;// [27:24] loop_y (chromaPartHeight-1)
158    ;// [23:20] chromaPartWidth-1
159    ;// [19:16] chromaPartHeight-1
160    ;// [15:00] nothing
    ;// count is the same register as x0 and is built with ADD, so bits
    ;// [15:0] still hold x0 (0 after a fill). Nothing below tests them.
    ;// NOTE(review): assumes x0 < 0x10000 so it cannot spill into bit 16.
161
162    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
163    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
164    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
165    ADD     count, count, tmp2, LSL #24 ;// loop_y
166    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
167    AND     tmp2, count, #0x00F00000    ;// (chromaPartWidth-1)<<20, seeds loop_x at loop1_y
168    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
169    MOV     c32, #32                    ;// rounding term; overwrites yFrac (same register)
170
171
172    ;///////////////////////////////////////////////////////////////////////////
173    ;// Cb
174    ;///////////////////////////////////////////////////////////////////////////
175
176    ;// 2x2 pels per iteration
177    ;// bilinear vertical and horizontal interpolation
    ;// Each SMUAD forms a vertical sum a*valY + b*yFrac for one column;
    ;// two adjacent column sums are then blended with valX/xFrac, 32 is
    ;// added and the result is shifted right by 6.
178
179loop1_y
    ;// left-most column of this row pair: rows 0, 1 and 2
180    LDRB    tmp1, [ptrA]
181    LDRB    tmp3, [ptrA, width]
182    LDRB    tmp5, [ptrA, width, LSL #1]
183
184    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
185    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
186
187    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
188    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
189
190    ADD     count, count, tmp2, LSL #8  ;// loop_x [31:28] = chromaPartWidth-1
191loop1_x
192    ;// first
    ;// tmp1/tmp3 hold the previous column's sums (upper/lower output row)
193    LDRB    tmp2, [ptrA, #1]!
194    LDRB    tmp4, [ptrA, width]
195    LDRB    tmp6, [ptrA, width, LSL #1]
196
197    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
198    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|
199
200    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
201    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
202    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5
203
204    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
205    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
206    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6
207
208    MOV     tmp6, tmp6, LSR #6          ;// scale down
209    STRB    tmp6, [mb, #8]              ;// store pixel (lower row, output stride 8)
210    MOV     tmp5, tmp5, LSR #6          ;// scale down
211    STRB    tmp5, [mb], #1              ;// store pixel (upper row), advance mb
212
213    ;// second
    ;// roles swap: tmp2/tmp4 are now the previous column, and the new
    ;// tmp1/tmp3 sums carry over into the next pass of loop1_x
214    LDRB    tmp1, [ptrA, #1]!
215    LDRB    tmp3, [ptrA, width]
216    LDRB    tmp5, [ptrA, width, LSL #1]
217
218    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
219    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
220
221    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
222    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
223    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5
224
225    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
226    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
227    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6
228
229    MOV     tmp6, tmp6, LSR #6          ;// scale down
230    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
231    MOV     tmp5, tmp5, LSR #6          ;// scale down
232    STRB    tmp5, [mb], #1              ;// store pixel (upper row), advance mb
233
234    SUBS    count, count, #2<<28        ;// loop_x -= 2; carry stays set until it borrows
235    BCS     loop1_x
236
237    AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1)<<20
238
    ;// Advance both pointers by two rows and rewind chromaPartWidth columns.
    ;// ADDS leaves carry clear unless mb+16 wraps, so each SBC subtracts
    ;// (chromaPartWidth-1)+1; the plain ADD in between keeps the flags.
239    ADDS    mb, mb, #16
240    SBC     mb, mb, tmp2, LSR #20
241    ADD     ptrA, ptrA, width, LSL #1
242    SBC     ptrA, ptrA, tmp2, LSR #20
243
    ;// loop_x was left as 0xF by the borrow (for even widths), so adding
    ;// 0xE<<24 wraps the top byte: loop_y -= 2 and loop_x becomes 0. The
    ;// result is negative, ending the loop, once loop_y drops below 0.
244    ADDS    count, count, #0xE << 24
245    BGE     loop1_y
246
247    ;///////////////////////////////////////////////////////////////////////////
248    ;// Cr
249    ;///////////////////////////////////////////////////////////////////////////
    ;// Second plane: the source starts 'height' rows below the first plane
    ;// in the reference, and the output starts 64 bytes after
    ;// predPartChroma. Values are reloaded from the stack because r0-r3 and
    ;// r5 were reused as working registers above.
250    LDR     height, [sp,#0xfc]          ;// height
251    LDR     ref, [sp, #0xc4]            ;// ref
252    LDR     tmp1, [sp, #0xd0]           ;// y0
253    LDR     tmp2, [sp, #0xcc]           ;// x0
254    LDR     mb, [sp, #0xc8]             ;// predPartChroma
255
256    ADD     tmp1, height, tmp1          ;// tmp1 = height + y0
257    MLA     tmp3, tmp1, width, tmp2     ;// tmp3 = (height+y0)*width + x0
258    ADD     ptrA, ref, tmp3             ;// ptrA = ref + (height+y0)*width + x0
259    ADD     mb, mb, #64                 ;// second output plane
260
    ;// Re-arm the counters from the constant fields in bits [23:16].
261    AND     count, count, #0x00FFFFFF   ;// clear loop_x and loop_y
262    AND     tmp1, count, #0x000F0000    ;// (chromaPartHeight-1)<<16
263    ADD     count, count, tmp1, LSL #8  ;// loop_y [27:24] = chromaPartHeight-1
264    AND     tmp2, count, #0x00F00000    ;// (chromaPartWidth-1)<<20, seeds loop_x at loop2_y
265
266    ;// 2x2 pels per iteration
267    ;// bilinear vertical and horizontal interpolation
    ;// Same instruction sequence as the loop1_y/loop1_x loops above.
268loop2_y
    ;// left-most column of this row pair: rows 0, 1 and 2
269    LDRB    tmp1, [ptrA]
270    LDRB    tmp3, [ptrA, width]
271    LDRB    tmp5, [ptrA, width, LSL #1]
272
273    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
274    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
275
276    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
277    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
278
279    ADD     count, count, tmp2, LSL #8  ;// loop_x [31:28] = chromaPartWidth-1
280loop2_x
281    ;// first
282    LDRB    tmp2, [ptrA, #1]!
283    LDRB    tmp4, [ptrA, width]
284    LDRB    tmp6, [ptrA, width, LSL #1]
285
286    PKHBT   tmp2, tmp2, tmp4, LSL #16   ;// |t4|t2|
287    PKHBT   tmp4, tmp4, tmp6, LSL #16   ;// |t6|t4|
288
289    SMUAD   tmp2, tmp2, valY            ;// t2=(t2*valY + t4*yFrac)
290    MLA     tmp5, tmp1, valX, c32       ;// t5=t1*valX+32
291    MLA     tmp5, tmp2, xFrac, tmp5     ;// t5=t2*xFrac+t5
292
293    SMUAD   tmp4, tmp4, valY            ;// t4=(t4*valY + t6*yFrac)
294    MLA     tmp6, tmp3, valX, c32       ;// t6=t3*valX+32
295    MLA     tmp6, tmp4, xFrac, tmp6     ;// t6=t4*xFrac+t6
296
297    MOV     tmp6, tmp6, LSR #6          ;// scale down
298    STRB    tmp6, [mb, #8]              ;// store pixel (lower row, output stride 8)
299    MOV     tmp5, tmp5, LSR #6          ;// scale down
300    STRB    tmp5, [mb], #1              ;// store pixel (upper row), advance mb
301
302    ;// second
303    LDRB    tmp1, [ptrA, #1]!
304    LDRB    tmp3, [ptrA, width]
305    LDRB    tmp5, [ptrA, width, LSL #1]
306
307    PKHBT   tmp1, tmp1, tmp3, LSL #16   ;// |t3|t1|
308    PKHBT   tmp3, tmp3, tmp5, LSL #16   ;// |t5|t3|
309
310    SMUAD   tmp1, tmp1, valY            ;// t1=(t1*valY + t3*yFrac)
311    MLA     tmp5, tmp1, xFrac, c32      ;// t5=t1*xFrac+32
312    MLA     tmp5, tmp2, valX, tmp5      ;// t5=t2*valX+t5
313
314    SMUAD   tmp3, tmp3, valY            ;// t3=(t3*valY + t5*yFrac)
315    MLA     tmp6, tmp3, xFrac, c32      ;// t6=t3*xFrac+32
316    MLA     tmp6, tmp4, valX, tmp6      ;// t6=t4*valX+t6
317
318    MOV     tmp6, tmp6, LSR #6          ;// scale down
319    STRB    tmp6, [mb, #8]              ;// store pixel (lower row)
320    MOV     tmp5, tmp5, LSR #6          ;// scale down
321    STRB    tmp5, [mb], #1              ;// store pixel (upper row), advance mb
322
323    SUBS    count, count, #2<<28        ;// loop_x -= 2; carry stays set until it borrows
324    BCS     loop2_x
325
326    AND     tmp2, count, #0x00F00000    ;// tmp2 = (chromaPartWidth-1)<<20
327
    ;// two rows forward, chromaPartWidth columns back (carry-clear SBC,
    ;// as in the first plane)
328    ADDS    mb, mb, #16
329    SBC     mb, mb, tmp2, LSR #20
330    ADD     ptrA, ptrA, width, LSL #1
331    SBC     ptrA, ptrA, tmp2, LSR #20
332
333    ADDS    count, count, #0xE << 24    ;// loop_y -= 2, loop_x 0xF -> 0; negative when done
334    BGE     loop2_y
335
    ;// Epilogue: drop the 0xc4 bytes of locals plus the 0x10 bytes of saved
    ;// r0-r3, then restore r4-r11 and return by loading the saved lr into pc.
336    ADD     sp,sp,#0xd4
337    LDMFD   sp!,{r4-r11,pc}
338
339    END
340