1; Copyright (C) 2009 The Android Open Source Project
2;
3; Licensed under the Apache License, Version 2.0 (the "License");
4; you may not use this file except in compliance with the License.
5; You may obtain a copy of the License at
6;
7;      http://www.apache.org/licenses/LICENSE-2.0
8;
9; Unless required by applicable law or agreed to in writing, software
10; distributed under the License is distributed on an "AS IS" BASIS,
11; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12; See the License for the specific language governing permissions and
13; limitations under the License.
14
15;-------------------------------------------------------------------------------
16;--
17;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaHor function
18;--
19;-------------------------------------------------------------------------------
20
21
    IF  :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        ;// Assert that callers provide, and this code preserves,
        ;// 8-byte stack alignment as required by the AAPCS.
        REQUIRE8
        PRESERVE8
    ENDIF

    ;// Place the following code in the .text (code) section.
    AREA    |.text|, CODE
30
31
;// h264bsdInterpolateChromaHor register allocation
;//
;// NOTE: several symbolic names below alias the SAME physical register
;// (e.g. tmp8 and chrPW are both r6, tmp1 and chrPH are both r7,
;// x0 and count are both r2). The aliased names are never live at the
;// same time with different values except where the code deliberately
;// exploits the aliasing (see the SUB chrPW, chrPW, #1 in do_fill,
;// which undoes the earlier ADD tmp8, chrPW, #1 on the same register).

ref     RN 0
ptrA    RN 0

mb      RN 1
block   RN 1

x0      RN 2
count   RN 2

y0      RN 3
valX    RN 3

width   RN 4

height  RN 5
tmp7    RN 5

chrPW   RN 6
tmp8    RN 6

tmp1    RN 7
chrPH   RN 7

tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

tmp5    RN 11

tmp6    RN 12

c32     RN 14
xFrac   RN 14
69
;// Function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateChromaHor

;//  Function arguments
;//
;//  The hex values are byte offsets from sp AFTER the prologue
;//  (STMFD sp!, {r0-r11,lr} followed by SUB sp, sp, #0xc4):
;//  0xc4-0xd0 are the stacked copies of r0-r3 (the register arguments),
;//  0xf8 onward are the caller's stack arguments. sp+0x00..0x10 is used
;//  for outgoing arguments to h264bsdFillBlock and sp+0x1c is a local
;//  buffer for the edge-padded source block.
;//
;//  u8 *ref,                   : 0xc4
;//  u8 *predPartChroma,        : 0xc8
;//  i32 x0,                    : 0xcc
;//  i32 y0,                    : 0xd0
;//  u32 width,                 : 0xf8
;//  u32 height,                : 0xfc
;//  u32 xFrac,                 : 0x100
;//  u32 chromaPartWidth,       : 0x104
;//  u32 chromaPartHeight       : 0x108
87
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateChromaHor
;//
;// Horizontal 1/8-pel bilinear chroma interpolation for both chroma
;// components (Cb, then Cr which follows Cb at ref + width*height):
;//
;//     out[x] = ((8-xFrac)*p[x] + xFrac*p[x+1] + 4) >> 3
;//
;// computed as (8*(8-xFrac)*p[x] + 8*xFrac*p[x+1] + 32) >> 6 so a single
;// SMLAD per output pixel can apply both weights with rounding term 32
;// (the weights are pre-multiplied by 8, see "multiply by 8 in advance").
;//
;// If any part of the (chromaPartWidth+1) x chromaPartHeight source area
;// lies outside the reference frame, h264bsdFillBlock first builds an
;// edge-replicated copy of the area (both components) in a stack buffer
;// and the interpolation reads from that copy instead.
;//-----------------------------------------------------------------------------
h264bsdInterpolateChromaHor
    STMFD   sp!, {r0-r11,lr}            ;// r0-r3 stacked too so the register
                                        ;// arguments can be reloaded after calls
    SUB     sp, sp, #0xc4               ;// outgoing args + local padded buffer

    ;// Bounds check: fall into do_fill if the read area is not fully
    ;// inside the frame. Note one extra column (chromaPartWidth+1) is
    ;// needed for the horizontal filter.
    LDR     chrPW, [sp, #0x104]     ;// chromaPartWidth
    LDR     width, [sp, #0xf8]      ;// width
    CMP     x0, #0
    BLT     do_fill                 ;// x0 < 0

    ADD     tmp6, x0, chrPW         ;// tmp6 = x0+ chromaPartWidth
    ADD     tmp6, tmp6, #1          ;// tmp6 = x0 + chromaPartWidth + 1
    CMP     tmp6, width             ;// x0+chromaPartWidth+1 > width
    BHI     do_fill

    CMP     y0, #0
    BLT     do_fill                 ;// y0 < 0
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp6, y0, chrPH         ;// tmp6 = y0 + chromaPartHeight
    CMP     tmp6, height
    BLS     skip_fill               ;// fully inside the frame -> no padding

;// Out-of-frame case: call h264bsdFillBlock once per chroma component to
;// copy the needed area with edge replication into the buffer at sp+0x1c.
;// Outgoing stack args at sp+0x00..0x10:
;//   width, height, blockWidth = chrPW+1, blockHeight = chrPH,
;//   fillScanLength = chrPW+1
do_fill
    LDR     chrPH, [sp, #0x108]     ;// chromaPartHeight
    LDR     height, [sp, #0xfc]     ;// height
    ADD     tmp8, chrPW, #1         ;// tmp8 = chromaPartWidth+1
                                    ;// (tmp8 aliases chrPW: both are r6)
    MOV     tmp2, tmp8              ;// tmp2 = chromaPartWidth+1
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    BL      h264bsdFillBlock        ;// fill first (Cb) component

    LDR     x0, [sp, #0xcc]         ;// reload args clobbered by the call
    LDR     y0, [sp, #0xd0]
    LDR     ref, [sp, #0xc4]        ;// ref
    STMIA   sp,{width,height,tmp8,chrPH,tmp2}
    ADD     block, sp, #0x1c        ;// block
    MLA     ref, height, width, ref ;// ref += width * height;
    MLA     block, chrPH, tmp8, block;// block + (chromaPH)*(chromaPW+1)
    BL      h264bsdFillBlock        ;// fill second (Cr) component right
                                    ;// after the first in the same buffer

    ;// Redirect the interpolation to the padded copy:
    ;// ref = buffer, x0 = y0 = 0, width = chrPW+1, height = chrPH.
    MOV     x0, #0                  ;// x0 = 0
    MOV     y0, #0                  ;// y0 = 0
    STR     x0, [sp, #0xcc]
    STR     y0, [sp, #0xd0]
    ADD     ref, sp, #0x1c          ;// ref = block
    STR     ref, [sp, #0xc4]        ;// ref

    STR     chrPH, [sp, #0xfc]      ;// height
    STR     tmp8, [sp, #0xf8]       ;// width
    MOV     width, tmp8
    SUB     chrPW, chrPW, #1        ;// tmp8 is the same register (r6):
                                    ;// undo the +1 to restore chromaPartWidth

skip_fill
    MLA     tmp3, y0, width, x0     ;// tmp3 = y0*width+x0
    LDR     xFrac, [sp, #0x100]     ;// xFrac
    ADD     ptrA, ref, tmp3         ;// ptrA = ref + y0*width+x0
    RSB     valX, xFrac, #8         ;// valX = 8-xFrac

    LDR     mb, [sp, #0xc8]         ;// predPartChroma


    ;// pack values to count register
    ;// (count aliases x0/r2; after the adds below its live fields are:)
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    ADD     count, count, tmp2, LSL #16 ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valX, valX, xFrac, LSL #16  ;// |xFrac|valX |
    MOV     valX, valX, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding term for (...+32)>>6


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration: two output rows in parallel,
    ;// bilinear HORIZONTAL interpolation (weights xFrac / 8-xFrac are
    ;// applied to horizontally adjacent pixels of the same row)

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x into bits [31:28]
    LDRB    tmp1, [ptrA, width]         ;// row 2, column 0
    LDRB    tmp2, [ptrA], #1            ;// row 1, column 0

loop1_x
    LDRB    tmp3, [ptrA, width]         ;// row 2, column j+1
    LDRB    tmp4, [ptrA], #1            ;// row 1, column j+1

    PKHBT   tmp5, tmp1, tmp3, LSL #16   ;// |row2 col j+1|row2 col j|
    PKHBT   tmp6, tmp2, tmp4, LSL #16   ;// |row1 col j+1|row1 col j|

    LDRB    tmp1, [ptrA, width]         ;// row 2, column j+2 (next pel pair)
    LDRB    tmp2, [ptrA], #1            ;// row 1, column j+2

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16   ;// |row2 col j+2|row2 col j+1|
    PKHBT   tmp8, tmp4, tmp2, LSL #16   ;// |row1 col j+2|row1 col j+1|

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1 (row stride 8)

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns done)
    BCS     loop1_x                     ;// continue while no borrow

    AND     tmp2, count, #0x00F00000    ;// loop_x = chromaPartWidth-1

    ;// Advance to the next pair of rows. The ADDS clears C (no 32-bit
    ;// overflow), so each SBC subtracts (chromaPartWidth-1)+1.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1              ;// ptrA = row start + 2*width

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (two rows done)
    BGE     loop1_y                     ;// continue while rows remain

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    ;// The Cr source area starts height rows below the Cb area
    ;// (ref + width*height); Cr output starts 64 bytes after Cb output.
    LDR     height, [sp,#0xfc]          ;// height
    LDR     ref, [sp, #0xc4]            ;// ref
    LDR     tmp1, [sp, #0xd0]           ;// y0
    LDR     tmp2, [sp, #0xcc]           ;// x0
    LDR     mb, [sp, #0xc8]             ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// y0 + height (skip first component)
    MLA     tmp3, tmp1, width, tmp2     ;// (y0+height)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #64                 ;// Cr output block

    ;// Re-initialize the packed loop counters from the preserved
    ;// [23:16] fields of count.
    AND     count, count, #0x00FFFFFF   ;// clear stale loop_x bits
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y = chromaPartHeight-1
    AND     tmp2, count, #0x00F00000    ;// loop_x

    ;// 2x2 pels per iteration: two output rows in parallel,
    ;// bilinear HORIZONTAL interpolation (same loop as Cb above)
loop2_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x into bits [31:28]
    LDRB    tmp1, [ptrA, width]         ;// row 2, column 0
    LDRB    tmp2, [ptrA], #1            ;// row 1, column 0

loop2_x
    LDRB    tmp3, [ptrA, width]         ;// row 2, column j+1
    LDRB    tmp4, [ptrA], #1            ;// row 1, column j+1

    PKHBT   tmp5, tmp1, tmp3, LSL #16   ;// |row2 col j+1|row2 col j|
    PKHBT   tmp6, tmp2, tmp4, LSL #16   ;// |row1 col j+1|row1 col j|

    LDRB    tmp1, [ptrA, width]         ;// row 2, column j+2 (next pel pair)
    LDRB    tmp2, [ptrA], #1            ;// row 1, column j+2

    SMLAD   tmp5, tmp5, valX, c32       ;// multiply
    SMLAD   tmp6, tmp6, valX, c32       ;// multiply

    PKHBT   tmp7, tmp3, tmp1, LSL #16   ;// |row2 col j+2|row2 col j+1|
    PKHBT   tmp8, tmp4, tmp2, LSL #16   ;// |row1 col j+2|row1 col j+1|

    SMLAD   tmp7, tmp7, valX, c32       ;// multiply
    SMLAD   tmp8, tmp8, valX, c32       ;// multiply

    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb,#8]               ;// store row 2 col 1 (row stride 8)

    MOV     tmp6, tmp6, LSR #6          ;// scale down
    STRB    tmp6, [mb],#1               ;// store row 1 col 1

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 2

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb],#1               ;// store row 1 col 2

    SUBS    count, count, #2<<28        ;// loop_x -= 2 (two columns done)
    BCS     loop2_x                     ;// continue while no borrow

    AND     tmp2, count, #0x00F00000    ;// loop_x = chromaPartWidth-1

    ;// Advance to the next pair of rows (see Cb loop for the carry trick).
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 16 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20
    SUB     ptrA, ptrA, #1              ;// ptrA = row start + 2*width

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2 (two rows done)
    BGE     loop2_y                     ;// continue while rows remain

    ;// Epilogue: 0xd4 = 0xc4 locals + 0x10 stacked r0-r3; then restore
    ;// the callee-saved registers and return (lr was pushed, pops to pc).
    ADD     sp,sp,#0xd4
    LDMFD   sp!, {r4-r11,pc}

    END
299