1; Copyright (C) 2009 The Android Open Source Project
2;
3; Licensed under the Apache License, Version 2.0 (the "License");
4; you may not use this file except in compliance with the License.
5; You may obtain a copy of the License at
6;
7;      http://www.apache.org/licenses/LICENSE-2.0
8;
9; Unless required by applicable law or agreed to in writing, software
10; distributed under the License is distributed on an "AS IS" BASIS,
11; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12; See the License for the specific language governing permissions and
13; limitations under the License.
14
15;-------------------------------------------------------------------------------
16;--
17;-- Abstract : ARMv6 optimized version of h264bsdInterpolateChromaVer function
18;--
19;-------------------------------------------------------------------------------
20
21
22    IF :DEF: H264DEC_WINASM
23        ;// We dont use REQUIRE8 and PRESERVE8 for winasm
24    ELSE
25        REQUIRE8
26        PRESERVE8
27    ENDIF
28
29    AREA    |.text|, CODE
30
;// Register roles for h264bsdInterpolateChromaVer.
;//
;// Several registers carry two names: the first value is no longer needed
;// by the time the second name is written, so the aliases never overlap
;// in lifetime.

ref     RN 0    ;// r0: source pointer as received / reloaded from the frame
ptrA    RN 0    ;// r0: running source pointer inside the pixel loops

mb      RN 1    ;// r1: running destination pointer (predPartChroma)
block   RN 1    ;// r1: buffer address handed to h264bsdFillBlock

x0      RN 2    ;// r2: horizontal start coordinate
count   RN 2    ;// r2: packed loop counters (see the packing comment)

y0      RN 3    ;// r3: vertical start coordinate
valY    RN 3    ;// r3: packed interpolation weights for SMLAD

width   RN 4    ;// r4: source row stride in bytes

height  RN 5    ;// r5: source plane height
tmp7    RN 5    ;// r5: loop scratch, row 2 / column 1 result

chrPW   RN 6    ;// r6: chromaPartWidth
tmp8    RN 6    ;// r6: loop scratch, row 2 / column 2 result

tmp1    RN 7    ;// r7: scratch

tmp2    RN 8    ;// r8: scratch

tmp3    RN 9    ;// r9: scratch

tmp4    RN 10   ;// r10: scratch

tmp5    RN 11   ;// r11: loop scratch
chrPH   RN 11   ;// r11: chromaPartHeight (used before the loops start)

tmp6    RN 12   ;// r12: loop scratch

c32     RN 14   ;// r14: rounding constant 32 (lr is saved by the prologue)
yFrac   RN 14   ;// r14: vertical fraction, live only until c32 is loaded
68
69;// Function exports and imports
70
71    IMPORT  h264bsdFillBlock
72
73    EXPORT  h264bsdInterpolateChromaVer
74
;//-----------------------------------------------------------------------------
;//  h264bsdInterpolateChromaVer
;//
;//  Vertical bilinear interpolation of one chroma partition, 2x2 pels per
;//  loop iteration. Each output pel is
;//
;//      (8*(8-yFrac)*A + 8*yFrac*B + 32) >> 6
;//
;//  where A is the source pel and B the pel one row below it. The Cb plane
;//  is processed first; the Cr plane is then read from ref + width*height
;//  and written to predPartChroma + 64. Output rows are 8 bytes apart.
;//
;//  If the block to be read (chromaPartWidth x (chromaPartHeight+1)) is not
;//  fully inside the width x height plane, both planes are first copied
;//  into a stack buffer with h264bsdFillBlock and interpolated from there.
;//
;//  Function arguments and their offset from sp after the prologue:
;//
;//  u8 *ref,                   : 0xc4   (r0, stored by the prologue)
;//  u8 *predPartChroma,        : 0xc8   (r1, stored by the prologue)
;//  i32 x0,                    : 0xcc   (r2, stored by the prologue)
;//  i32 y0,                    : 0xd0   (r3, stored by the prologue)
;//  u32 width,                 : 0xf8
;//  u32 height,                : 0xfc
;//  u32 yFrac,                 : 0x100
;//  u32 chromaPartWidth,       : 0x104
;//  u32 chromaPartHeight       : 0x108
;//
;//  Frame layout below the saved registers:
;//
;//  sp+0x00 .. sp+0x13         : five outgoing stack args of h264bsdFillBlock
;//  sp+0x1c .. sp+0xc3         : fill buffer (Cb block followed by Cr block)
;//
;//  NOTE(review): the 4-bit counter fields and the 2x2 loop step assume
;//  chromaPartWidth and chromaPartHeight are even and at most 16, and the
;//  fill buffer size assumes blocks no larger than 8x8 - confirm with caller.
;//-----------------------------------------------------------------------------

CV_BLOCK        EQU     0x1c        ;// fill buffer inside the local frame
CV_LOCALS       EQU     0xc4        ;// bytes of locals below the saved regs
CV_ARG_REF      EQU     0xc4        ;// saved r0
CV_ARG_PRED     EQU     0xc8        ;// saved r1
CV_ARG_X0       EQU     0xcc        ;// saved r2
CV_ARG_Y0       EQU     0xd0        ;// saved r3
CV_SAVED_R4     EQU     0xd4        ;// first callee-saved register
CV_ARG_WIDTH    EQU     0xf8
CV_ARG_HEIGHT   EQU     0xfc
CV_ARG_YFRAC    EQU     0x100
CV_ARG_CHR_PW   EQU     0x104
CV_ARG_CHR_PH   EQU     0x108
CV_CR_PRED      EQU     64          ;// Cr output offset in predPartChroma

h264bsdInterpolateChromaVer
    ;// r0-r3 are pushed as well so that they can be reloaded (and, on the
    ;// fill path, overwritten) through the frame.
    STMFD   sp!, {r0-r11,lr}
    SUB     sp, sp, #CV_LOCALS

    LDR     chrPW, [sp, #CV_ARG_CHR_PW] ;// chromaPartWidth
    LDR     width, [sp, #CV_ARG_WIDTH]  ;// width
    CMP     x0, #0
    BLT     do_fill                     ;// x0 < 0 (signed)

    ADD     tmp1, x0, chrPW             ;// tmp1 = x0 + chromaPartWidth
    CMP     tmp1, width
    BHI     do_fill                     ;// x0 + chromaPartWidth > width

    CMP     y0, #0
    BLT     do_fill                     ;// y0 < 0 (signed)
    LDR     chrPH, [sp, #CV_ARG_CHR_PH] ;// chromaPartHeight
    LDR     height, [sp, #CV_ARG_HEIGHT] ;// height
    ADD     tmp1, y0, chrPH             ;// tmp1 = y0 + chromaPartHeight
    ADD     tmp1, tmp1, #1              ;// +1: the loops read one extra row
    CMP     tmp1, height
    BLS     skip_fill                   ;// block fully inside the plane

do_fill
    ;// Reloaded here because the early branches arrive before the loads
    ;// above have executed.
    LDR     chrPH, [sp, #CV_ARG_CHR_PH] ;// chromaPartHeight
    LDR     height, [sp, #CV_ARG_HEIGHT] ;// height
    ADD     tmp1, chrPH, #1             ;// tmp1 = chromaPartHeight+1
    MOV     tmp2, chrPW                 ;// tmp2 = chromaPartWidth
    ;// Stack args: width, height, chromaPartWidth, chromaPartHeight+1,
    ;// chromaPartWidth. r0, r2, r3 still hold ref, x0, y0 from entry.
    STMIA   sp,{width,height,chrPW,tmp1,tmp2}
    ADD     block, sp, #CV_BLOCK        ;// block
    BL      h264bsdFillBlock            ;// Cb plane

    ;// r0-r3 are not preserved by the call: reload them, and rewrite the
    ;// outgoing argument words before the second call.
    LDR     x0, [sp, #CV_ARG_X0]
    LDR     y0, [sp, #CV_ARG_Y0]
    LDR     ref, [sp, #CV_ARG_REF]      ;// ref
    STMIA   sp,{width,height,chrPW,tmp1,tmp2}
    ADD     block, sp, #CV_BLOCK        ;// block
    MLA     ref, height, width, ref     ;// ref += width * height;
    MLA     block, chrPW, tmp1, block   ;// block + (chromaPW)*(chromaPH+1)
    BL      h264bsdFillBlock            ;// Cr plane

    ;// From here on the fill buffer is the source: origin (0,0), stride
    ;// chromaPartWidth, plane height chromaPartHeight+1. The frame copies
    ;// are updated because the Cr pass reloads them.
    MOV     x0, #0                      ;// x0 = 0
    MOV     y0, #0                      ;// y0 = 0
    STR     x0, [sp, #CV_ARG_X0]
    STR     y0, [sp, #CV_ARG_Y0]
    ADD     ref, sp, #CV_BLOCK          ;// ref = block
    STR     ref, [sp, #CV_ARG_REF]      ;// ref

    STR     tmp1, [sp, #CV_ARG_HEIGHT]  ;// height = chromaPartHeight+1
    STR     chrPW, [sp, #CV_ARG_WIDTH]  ;// width  = chromaPartWidth
    MOV     width, chrPW

skip_fill
    MLA     tmp3, y0, width, x0         ;// tmp3 = y0*width+x0
    LDR     yFrac, [sp, #CV_ARG_YFRAC]  ;// yFrac
    ADD     ptrA, ref, tmp3             ;// ptrA = ref + y0*width+x0
    RSB     valY, yFrac, #8             ;// valY = 8-yFrac

    LDR     mb, [sp, #CV_ARG_PRED]      ;// predPartChroma


    ;// pack values to count register
    ;// [31:28] loop_x (chromaPartWidth-1)
    ;// [27:24] loop_y (chromaPartHeight-1)
    ;// [23:20] chromaPartWidth-1
    ;// [19:16] chromaPartHeight-1
    ;// [15:00] nothing

    SUB     tmp2, chrPH, #1             ;// chromaPartHeight-1
    SUB     tmp1, chrPW, #1             ;// chromaPartWidth-1
    ;// count shares r2 with x0, which is dead after the MLA above. Start
    ;// from a clean value so that no bits of x0 can reach the counter fields.
    MOV     count, tmp2, LSL #16        ;// chromaPartHeight-1
    ADD     count, count, tmp2, LSL #24 ;// loop_y
    ADD     count, count, tmp1, LSL #20 ;// chromaPartWidth-1
    AND     tmp2, count, #0x00F00000    ;// loop_x
    PKHBT   valY, valY, yFrac, LSL #16  ;// |yFrac|valY |
    MOV     valY, valY, LSL #3          ;// multiply by 8 in advance
    MOV     c32, #32                    ;// rounding term (overwrites yFrac)


    ;///////////////////////////////////////////////////////////////////////////
    ;// Cb
    ;///////////////////////////////////////////////////////////////////////////

    ;// 2x2 pels per iteration
    ;// bilinear vertical interpolation

loop1_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x = chromaPartWidth-1
loop1_x
    ;// Process 2x2 block
    LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col

    LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col

    PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|

    ;// SMLAD: lo*lo + hi*hi + 32, i.e. upper*8*(8-yFrac) + lower*8*yFrac + 32
    SMLAD   tmp7, tmp2, valY, c32       ;// multiply
    PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    SMLAD   tmp2, tmp1, valY, c32       ;// multiply
    SMLAD   tmp8, tmp5, valY, c32       ;// multiply
    SMLAD   tmp5, tmp4, valY, c32       ;// multiply

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    MOV     tmp2, tmp2, LSR #6          ;// scale down
    STRB    tmp2, [mb],#1               ;// store row 1 col 1

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb],#1               ;// store row 1 col 2


    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop1_x                     ;// continue until the field borrows

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1, still at [23:20]

    ;// ADDS leaves the carry clear (the pointer does not wrap), so each SBC
    ;// below subtracts (chromaPartWidth-1)+1 = chromaPartWidth. The plain
    ;// ADD in between does not touch the flags.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 2*8 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ;// Adding 0xE to the loop_y nibble subtracts 2 from it. While loop_y
    ;// stays >= 0 the nibble carries out and clears the 0xF left in loop_x
    ;// by the x loop, giving a non-negative result; otherwise bit 31 stays
    ;// set and the loop ends.
    ADDS    count, count, #0xE << 24
    BGE     loop1_y

    ;///////////////////////////////////////////////////////////////////////////
    ;// Cr
    ;///////////////////////////////////////////////////////////////////////////
    LDR     height, [sp, #CV_ARG_HEIGHT] ;// height (r5 was used as tmp7)
    LDR     ref, [sp, #CV_ARG_REF]      ;// ref
    LDR     tmp1, [sp, #CV_ARG_Y0]      ;// y0
    LDR     tmp2, [sp, #CV_ARG_X0]      ;// x0
    LDR     mb, [sp, #CV_ARG_PRED]      ;// predPartChroma

    ADD     tmp1, height, tmp1          ;// skip the whole Cb plane
    MLA     tmp3, tmp1, width, tmp2     ;// (height+y0)*width + x0
    ADD     ptrA, ref, tmp3
    ADD     mb, mb, #CV_CR_PRED         ;// Cr output follows Cb output

    AND     count, count, #0x00FFFFFF   ;// drop the exhausted loop fields
    AND     tmp1, count, #0x000F0000    ;// chromaPartHeight-1
    ADD     count, count, tmp1, LSL #8  ;// reload loop_y
    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1 for loop_x

    ;// 2x2 pels per iteration
    ;// bilinear vertical interpolation
loop2_y
    ADD     count, count, tmp2, LSL #8  ;// reload loop_x = chromaPartWidth-1
loop2_x
    ;// Process 2x2 block
    LDRB    tmp2, [ptrA,width]          ;// 2 row, 1 col
    LDRB    tmp3, [ptrA,width, LSL #1]  ;// 3 row, 1 col
    LDRB    tmp1, [ptrA],#1             ;// 1 row, 1 col

    LDRB    tmp5, [ptrA,width]          ;// 2 row, 2 col
    LDRB    tmp6, [ptrA,width, LSL #1]  ;// 3 row, 2 col
    LDRB    tmp4, [ptrA],#1             ;// 1 row, 2 col

    PKHBT   tmp1, tmp1, tmp2, LSL #16   ;// |B|A|
    PKHBT   tmp2, tmp2, tmp3, LSL #16   ;// |C|B|
    PKHBT   tmp4, tmp4, tmp5, LSL #16   ;// |B|A|

    SMLAD   tmp7, tmp2, valY, c32       ;// multiply
    PKHBT   tmp5, tmp5, tmp6, LSL #16   ;// |C|B|
    SMLAD   tmp2, tmp1, valY, c32       ;// multiply
    SMLAD   tmp8, tmp5, valY, c32       ;// multiply
    SMLAD   tmp5, tmp4, valY, c32       ;// multiply

    MOV     tmp7, tmp7, LSR #6          ;// scale down
    STRB    tmp7, [mb,#8]               ;// store row 2 col 1
    MOV     tmp2, tmp2, LSR #6          ;// scale down
    STRB    tmp2, [mb],#1               ;// store row 1 col 1

    MOV     tmp8, tmp8, LSR #6          ;// scale down
    STRB    tmp8, [mb,#8]               ;// store row 2 col 2
    MOV     tmp5, tmp5, LSR #6          ;// scale down
    STRB    tmp5, [mb],#1               ;// store row 1 col 2


    SUBS    count, count, #2<<28        ;// loop_x -= 2
    BCS     loop2_x

    AND     tmp2, count, #0x00F00000    ;// chromaPartWidth-1

    ;// Same carry-clear ADDS/SBC pointer rewind as in the Cb loop.
    ADDS    mb, mb, #16
    SBC     mb, mb, tmp2, LSR #20       ;// mb += 2*8 - chromaPartWidth
    ADD     ptrA, ptrA, width, LSL #1
    SBC     ptrA, ptrA, tmp2, LSR #20   ;// ptrA += 2*width - chromaPartWidth

    ADDS    count, count, #0xE << 24    ;// loop_y -= 2, see Cb loop
    BGE     loop2_y

    ;// Drop the locals and the saved r0-r3, then restore r4-r11 and return.
    ADD     sp,sp,#CV_SAVED_R4
    LDMFD   sp!, {r4-r11,pc}
287
288    END
289