1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_sixtap_predict8x4_armv6|
13
14    AREA    |.text|, CODE, READONLY  ; name this block of code
15;-------------------------------------
16; r0    unsigned char *src_ptr,
17; r1    int  src_pixels_per_line,
18; r2    int  xoffset,
19; r3    int  yoffset,
20; stack unsigned char *dst_ptr,
21; stack int  dst_pitch
22;-------------------------------------
23;Note: the first pass stores its result transposed on the stack: 8 lines (one per output column) of 9 entries
24;(one per source row). 184 bytes are reserved: a 4-byte slot holding yoffset followed by the temporary buffer.
25;Each line is 20 bytes: 9 halfwords of data plus 2 bytes of padding so every line stays 4-byte aligned.
;The second pass reads the buffer back through sp and stores its result transposed again, i.e. in the
;original orientation. xoffset and yoffset index filter8_coeff (16 bytes per entry); a zero offset skips
;the corresponding filter pass and copies instead.
;Loop counters are packed in r2: outer count in the top halfword, inner count in the low byte.
26|vp8_sixtap_predict8x4_armv6| PROC
27    stmdb       sp!, {r4 - r11, lr}         ; save r4-r11 and lr (9 words = 36 bytes)
28    str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset at the new sp
29
30    cmp         r2, #0                      ;skip first_pass filter if xoffset=0
31    add         lr, sp, #4                  ;point to temporary buffer (just above the saved yoffset)
32    beq         skip_firstpass_filter
33
34;first-pass filter
35    adr         r12, filter8_coeff          ; r12 = base of the coefficient table
36    sub         r0, r0, r1, lsl #1          ; src -= 2 * src_pixels_per_line (start two rows above)
37
38    add         r3, r1, #10                 ; preload from the next row (offset = stride + 10)
39    pld         [r0, r3]
40
41    add         r2, r12, r2, lsl #4         ;calculate filter location (xoffset * 16 bytes per entry)
42    add         r0, r0, #3                  ;adjust src only for loading convenience
43
44    ldr         r3, [r2]                    ; load up packed filter coefficients
45    ldr         r4, [r2, #4]
46    ldr         r5, [r2, #8]
47
48    mov         r2, #0x90000                ; height=9 is top part of counter
49
50    sub         r1, r1, #8                  ; r1 = stride - 8: r0 advances 8 bytes per row inside the width loop
51
; One iteration per source row. r0 points 3 bytes past the row start here,
; so offsets -5..-1 fetch the five bytes beginning 2 to the left of the row start.
52|first_pass_hloop_v6|
53    ldrb        r6, [r0, #-5]               ; load source data
54    ldrb        r7, [r0, #-4]
55    ldrb        r8, [r0, #-3]
56    ldrb        r9, [r0, #-2]
57    ldrb        r10, [r0, #-1]
58
59    orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 (4 iterations, 2 pixels each)
60
61    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6   (top halfword | bottom halfword)
62    pkhbt       r7, r7, r8, lsl #16         ; r8 | r7
63
64    pkhbt       r8, r8, r9, lsl #16         ; r9 | r8
65    pkhbt       r9, r9, r10, lsl #16        ; r10 | r9
66
; Each iteration produces two horizontally adjacent outputs: r11 (even) and r12 (odd).
67|first_pass_wloop_v6|
68    smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1]
69    smuad       r12, r7, r3
70
71    ldrb        r6, [r0], #1                ; fetch the next two source bytes
72
73    smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3]
74    ldrb        r7, [r0], #1
75    smlad       r12, r9, r4, r12
76
77    pkhbt       r10, r10, r6, lsl #16       ; r6 | r10
78    pkhbt       r6, r6, r7, lsl #16         ; r7 | r6
79    smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5]
80    smlad       r12, r6, r5, r12
81
82    sub         r2, r2, #1
83
84    add         r11, r11, #0x40             ; round_shift_and_clamp
85    tst         r2, #0xff                   ; test loop counter (low byte = remaining width iterations)
86    usat        r11, #8, r11, asr #7
87    add         r12, r12, #0x40
88    strh        r11, [lr], #20              ; result is transposed and stored: each output pixel goes to the next 20-byte line
89    usat        r12, #8, r12, asr #7
90
91    strh        r12, [lr], #20
92
; If more pixels remain, slide the packed window two pixels to the right.
; r11/r12 are used as temporaries because r6/r7 are overwritten first.
93    movne       r11, r6
94    movne       r12, r7
95
96    movne       r6, r8
97    movne       r7, r9
98    movne       r8, r10
99    movne       r9, r11
100    movne       r10, r12
101
102    bne         first_pass_wloop_v6
103
104    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
105    ;;IF ARCHITECTURE=6
106    ;pld        [src, ppl]
107    ;;pld       [src, r9]
108    ;;ENDIF
109
110    subs        r2, r2, #0x10000            ; one source row done; flags are consumed by the bne below
111
112    sub         lr, lr, #158                ; back to the first line, one halfword further in (8*20 - 2)
113
114    add         r0, r0, r1                  ; move to next input line
115
116    add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier
117    pld         [r0, r11]
118
119    bne         first_pass_hloop_v6
120
121;second pass filter
122secondpass_filter
123    ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer
124    ldr         r0, [sp, #216]              ; load dst address from stack 180+36
125    ldr         r1, [sp, #220]              ; load dst stride from stack 180+40
126
127    cmp         r3, #0
128    beq         skip_secondpass_filter      ; yoffset=0: copy the buffer to dst without filtering
129
130    adr         r12, filter8_coeff
131    add         lr, r12, r3, lsl #4         ;calculate filter location
132
133    mov         r2, #0x00080000             ; 8 buffer lines (output columns) is top part of counter
134
135    ldr         r3, [lr]                    ; load up packed filter coefficients
136    ldr         r4, [lr, #4]
137    ldr         r5, [lr, #8]
138
139    pkhbt       r12, r4, r3                 ; pack the filter differently: top(r3) | bottom(r4), for the odd outputs
140    pkhbt       r11, r5, r4                 ; top(r4) | bottom(r5)
141
; One iteration per buffer line, i.e. per destination column. sp is used as the read pointer.
142second_pass_hloop_v6
143    ldr         r6, [sp]                    ; load the data
144    ldr         r7, [sp, #4]
145
146    orr         r2, r2, #2                  ; loop counter: 2 iterations, 2 outputs each
147
; Each iteration produces two vertically adjacent outputs: lr first, then r10 one sample later.
148second_pass_wloop_v6
149    smuad       lr, r3, r6                  ; apply filter
150    smulbt      r10, r3, r6                 ; bottom(r3) * top(r6)
151
152    ldr         r8, [sp, #8]
153
154    smlad       lr, r4, r7, lr
155    smladx      r10, r12, r7, r10
156
157    ldrh        r9, [sp, #12]               ; one extra sample for the last tap of the second output
158
159    smlad       lr, r5, r8, lr
160    smladx      r10, r11, r8, r10
161
162    add         sp, sp, #4                  ; advance two samples within the line
163    smlatb      r10, r5, r9, r10            ; top(r5) * bottom(r9)
164
165    sub         r2, r2, #1
166
167    add         lr, lr, #0x40               ; round_shift_and_clamp
168    tst         r2, #0xff
169    usat        lr, #8, lr, asr #7
170    add         r10, r10, #0x40
171    strb        lr, [r0], r1                ; the result is transposed back and stored
172    usat        r10, #8, r10, asr #7
173
174    strb        r10, [r0],r1
175
176    movne       r6, r7                      ; slide the window by two samples
177    movne       r7, r8
178
179    bne         second_pass_wloop_v6
180
181    subs        r2, r2, #0x10000            ; one column done; flags are consumed by the bne below
182    add         sp, sp, #12                 ; update src for next loop (20-8)
183    sub         r0, r0, r1, lsl #2          ; back up the 4 dst rows just written
184    add         r0, r0, #1                  ; next dst column
185
186    bne         second_pass_hloop_v6
187
188    add         sp, sp, #20                 ; release the rest of the reserved area: 184 - (4 + 8*20)
189    ldmia       sp!, {r4 - r11, pc}
190
191;--------------------
; xoffset=0: copy 9 source rows of 8 bytes into the temporary buffer, one halfword per pixel,
; using the same transposed layout as the first-pass filter.
192skip_firstpass_filter
193    sub         r0, r0, r1, lsl #1          ; src -= 2 * src_pixels_per_line
194    sub         r1, r1, #8                  ; r1 = stride - 8: r0 advances 8 bytes per row below
195    mov         r2, #9                      ; 9 source rows
196
197skip_firstpass_hloop
198    ldrb        r4, [r0], #1                ; load data
199    subs        r2, r2, #1                  ; flags are consumed by the bne at the end of the loop
200    ldrb        r5, [r0], #1
201    strh        r4, [lr], #20               ; store it to immediate buffer
202    ldrb        r6, [r0], #1                ; load data
203    strh        r5, [lr], #20
204    ldrb        r7, [r0], #1
205    strh        r6, [lr], #20
206    ldrb        r8, [r0], #1
207    strh        r7, [lr], #20
208    ldrb        r9, [r0], #1
209    strh        r8, [lr], #20
210    ldrb        r10, [r0], #1
211    strh        r9, [lr], #20
212    ldrb        r11, [r0], #1
213    strh        r10, [lr], #20
214    add         r0, r0, r1                  ; move to next input line
215    strh        r11, [lr], #20
216
217    sub         lr, lr, #158                ; back to the first line, one halfword further in (8*20 - 2)
218    bne         skip_firstpass_hloop
219
220    b           secondpass_filter
221
222;--------------------
; yoffset=0: copy 4 samples from each of the 8 buffer lines straight to dst, one dst column per line.
223skip_secondpass_filter
224    mov         r2, #8                      ; 8 buffer lines
225    add         sp, sp, #4                  ;start from src[0] instead of src[-2]
226
227skip_secondpass_hloop
228    ldr         r6, [sp], #4                ; two samples per word
229    subs        r2, r2, #1                  ; flags are consumed by the bne at the end of the loop
230    ldr         r8, [sp], #4
231
232    mov         r7, r6, lsr #16             ; unpack
233    strb        r6, [r0], r1
234    mov         r9, r8, lsr #16
235    strb        r7, [r0], r1
236    add         sp, sp, #12                 ; 20-8
237    strb        r8, [r0], r1
238    strb        r9, [r0], r1
239
240    sub         r0, r0, r1, lsl #2          ; back up the 4 dst rows just written
241    add         r0, r0, #1                  ; next dst column
242
243    bne         skip_secondpass_hloop
244
245    add         sp, sp, #16                 ; 180 - (160 +4)
246
247    ldmia       sp!, {r4 - r11, pc}
248
249    ENDP
250
251;-----------------
252;Six-tap filter coefficient table. One word each is reserved. Label filter8_coeff can be used to access the data.
253;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
;Each entry is 4 words (16 bytes), matching the "lsl #4" indexing by xoffset/yoffset in the code above.
;Words 0-2 hold the six taps as signed halfwords, two per word, with the even-numbered tap in the
;bottom halfword. Word 3 is zero and is not loaded by the code above.
254filter8_coeff
255    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000     ; taps  0,   0, 128,   0,   0,  0
256    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000     ; taps  0,  -6, 123,  12,  -1,  0
257    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000     ; taps  2, -11, 108,  36,  -8,  1
258    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000     ; taps  0,  -9,  93,  50,  -6,  0
259    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000     ; taps  3, -16,  77,  77, -16,  3
260    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000     ; taps  0,  -6,  50,  93,  -9,  0
261    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000     ; taps  1,  -8,  36, 108, -11,  2
262    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000     ; taps  0,  -1,  12, 123,  -6,  0
263
264    ;DCD        0,  0,  128,    0,   0,  0
265    ;DCD        0, -6,  123,   12,  -1,  0
266    ;DCD        2, -11, 108,   36,  -8,  1
267    ;DCD        0, -9,   93,   50,  -6,  0
268    ;DCD        3, -16,  77,   77, -16,  3
269    ;DCD        0, -6,   50,   93,  -9,  0
270    ;DCD        1, -8,   36,  108, -11,  2
271    ;DCD        0, -1,   12,  123,  -6,  0
272
273    END
274