1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_sixtap_predict8x4_armv6|
13
14    AREA    |.text|, CODE, READONLY  ; name this block of code
15;-------------------------------------
16; r0    unsigned char *src_ptr,
17; r1    int  src_pixels_per_line,
18; r2    int  xoffset,
19; r3    int  yoffset,
20; stack unsigned char *dst_ptr,
21; stack int  dst_pitch
22;-------------------------------------
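;note: The first pass stores its result transposed (8 lines x 9 columns) on the stack. 184 bytes of stack are
;reserved: 4 bytes hold the saved yoffset and the remaining 180 bytes are the temporary buffer. Each line is 20 bytes
;wide: 9 halfwords of data plus 2 bytes of padding to keep every line 4-byte aligned. The second pass loads the data
;back from the stack and transposes it again while storing the result to dst_ptr.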
|vp8_sixtap_predict8x4_armv6| PROC
; Six-tap sub-pixel prediction of an 8-wide, 4-high block.
;
; Pass 1 (horizontal) filters 9 source rows (2 above the block, 4 inside,
; 3 below) and writes the 8 results of each row as halfwords, transposed,
; into a temporary buffer on the stack: 8 lines of 20 bytes, one line per
; output column, 9 halfwords per line plus 2 bytes of padding.
; Pass 2 (vertical) filters each 9-entry line down to 4 pixels and writes
; them down one column of dst.
; Either pass is replaced by a plain copy when its offset is 0.
;
; The coefficient table lives in this code area, directly after ENDP, and
; is addressed with ADR so that no cross-area literal load or absolute
; pointer word is needed.
;
; NOTE(review): the word loads in pass 2 assume the lower-addressed halfword
; lands in the bottom half of the register (little-endian data) - confirm
; for any big-endian target.
    stmdb       sp!, {r4 - r11, lr}         ; save r4-r11 and lr (36 bytes)
    str         r3, [sp, #-184]!            ; reserve 184 bytes: [sp] = yoffset, [sp+4 .. sp+183] = temporary buffer

    cmp         r2, #0                      ; skip first-pass filter if xoffset = 0
    add         lr, sp, #4                  ; lr = write pointer into the temporary buffer
    beq         skip_firstpass_filter

;first-pass filter
; r0 = source pointer, r1 = stride - 8, r3/r4/r5 = packed taps (0|1, 2|3, 4|5),
; r2 = loop counter: rows remaining in bits 16 and up, pixel pairs remaining in the low byte,
; r6-r10 = sliding window of packed pixel pairs, r11/r12 = accumulators.
    adr         r12, filter8_coeff          ; PC-relative address of the coefficient table
    sub         r0, r0, r1, lsl #1          ; start 2 rows above the block

    add         r2, r12, r2, lsl #4         ; filter location: 16 bytes per xoffset entry
    add         r0, r0, #3                  ; bias src by 3 only for loading convenience

    ldr         r3, [r2]                    ; load packed filter coefficients: taps 0 (low half) and 1 (high half)
    ldr         r4, [r2, #4]                ; taps 2 and 3
    ldr         r5, [r2, #8]                ; taps 4 and 5

    mov         r2, #0x90000                ; height = 9 rows in the top part of the counter

    sub         r1, r1, #8                  ; each row advances r0 by 8, so r1 = stride - 8 reaches the next row

|first_pass_hloop_v6|
    ldrb        r6, [r0, #-5]               ; load source pixels p[-2] .. p[2] of this row
    ldrb        r7, [r0, #-4]
    ldrb        r8, [r0, #-3]
    ldrb        r9, [r0, #-2]
    ldrb        r10, [r0, #-1]

    orr         r2, r2, #0x4                ; inner counter: width 8 = 4 iterations x 2 pixels

    pkhbt       r6, r6, r7, lsl #16         ; r6 = p[-1] : p[-2]   (high : low halfword)
    pkhbt       r7, r7, r8, lsl #16         ; r7 = p[0]  : p[-1]

    pkhbt       r8, r8, r9, lsl #16         ; r8 = p[1]  : p[0]
    pkhbt       r9, r9, r10, lsl #16        ; r9 = p[2]  : p[1]

|first_pass_wloop_v6|
    smuad       r11, r6, r3                 ; even pixel: taps 0 and 1
    smuad       r12, r7, r3                 ; odd pixel:  taps 0 and 1

    ldrb        r6, [r0], #1                ; next source pixel, p[3]

    smlad       r11, r8, r4, r11            ; even pixel: taps 2 and 3
    ldrb        r7, [r0], #1                ; next source pixel, p[4]
    smlad       r12, r9, r4, r12            ; odd pixel:  taps 2 and 3

    pkhbt       r10, r10, r6, lsl #16       ; r10 = p[3] : p[2]
    pkhbt       r6, r6, r7, lsl #16         ; r6  = p[4] : p[3]
    smlad       r11, r10, r5, r11           ; even pixel: taps 4 and 5
    smlad       r12, r6, r5, r12            ; odd pixel:  taps 4 and 5

    sub         r2, r2, #1

    add         r11, r11, #0x40             ; round: add 64 before the shift by 7
    tst         r2, #0xff                   ; test inner loop counter (flags consumed by movne/bne below)
    usat        r11, #8, r11, asr #7        ; shift right by 7 and clamp to 0..255
    add         r12, r12, #0x40
    strh        r11, [lr], #20              ; store transposed: consecutive pixels go to consecutive 20-byte lines
    usat        r12, #8, r12, asr #7

    strh        r12, [lr], #20

    movne       r11, r6                     ; slide the pixel window two pixels to the right
    movne       r12, r7

    movne       r6, r8
    movne       r7, r9
    movne       r8, r10
    movne       r9, r11
    movne       r10, r12

    bne         first_pass_wloop_v6

    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF

    subs        r2, r2, #0x10000            ; one row done; Z set when all 9 rows are done

    sub         lr, lr, #158                ; 8 stores advanced lr by 160: net +2, the next halfword slot of line 0

    add         r0, r0, r1                  ; move to next input line

    bne         first_pass_hloop_v6

;second pass filter
secondpass_filter
    ldr         r3, [sp], #4                ; load back yoffset; sp now points at the temporary buffer
    ldr         r0, [sp, #216]              ; load dst address from stack: 180 (buffer) + 36 (saved registers)
    ldr         r1, [sp, #220]              ; load dst stride from stack: 180 + 40

    cmp         r3, #0
    beq         skip_secondpass_filter      ; yoffset = 0: plain copy instead of filtering

    adr         r12, filter8_coeff          ; PC-relative address of the coefficient table
    add         lr, r12, r3, lsl #4         ; filter location: 16 bytes per yoffset entry

    mov         r2, #0x00080000             ; 8 columns in the top part of the counter

    ldr         r3, [lr]                    ; load packed filter coefficients: taps 0|1
    ldr         r4, [lr, #4]                ; taps 2|3
    ldr         r5, [lr, #8]                ; taps 4|5

    pkhbt       r12, r4, r3                 ; r12 = tap1 : tap2, used with smladx for the odd output row
    pkhbt       r11, r5, r4                 ; r11 = tap3 : tap4

second_pass_hloop_v6
    ldr         r6, [sp]                    ; first two entries of this column's line
    ldr         r7, [sp, #4]                ; next two entries

    orr         r2, r2, #2                  ; inner counter: 4 output rows = 2 iterations x 2 rows

second_pass_wloop_v6
    smuad       lr, r3, r6                  ; even row: taps 0 and 1
    smulbt      r10, r3, r6                 ; odd row:  tap 0 times the second entry

    ldr         r8, [sp, #8]                ; entries 4 and 5 of the current window

    smlad       lr, r4, r7, lr              ; even row: taps 2 and 3
    smladx      r10, r12, r7, r10           ; odd row:  taps 1 and 2

    ldrh        r9, [sp, #12]               ; entry 6 of the current window

    smlad       lr, r5, r8, lr              ; even row: taps 4 and 5
    smladx      r10, r11, r8, r10           ; odd row:  taps 3 and 4

    add         sp, sp, #4                  ; advance the window by two entries
    smlatb      r10, r5, r9, r10            ; odd row:  tap 5

    sub         r2, r2, #1

    add         lr, lr, #0x40               ; round: add 64 before the shift by 7
    tst         r2, #0xff                   ; test inner loop counter (flags consumed by movne/bne below)
    usat        lr, #8, lr, asr #7          ; shift right by 7 and clamp to 0..255
    add         r10, r10, #0x40
    strb        lr, [r0], r1                ; the result is transposed back: store down the dst column
    usat        r10, #8, r10, asr #7

    strb        r10, [r0],r1

    movne       r6, r7                      ; slide the window for the next two rows
    movne       r7, r8

    bne         second_pass_wloop_v6

    subs        r2, r2, #0x10000            ; one column done; Z set when all 8 columns are done
    add         sp, sp, #12                 ; update src for next loop: 20-byte line minus the 8 already added
    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written
    add         r0, r0, #1                  ; and step to the next dst column

    bne         second_pass_hloop_v6

    add         sp, sp, #20                 ; release the rest of the buffer: 180 - 8 * 20
    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return

;--------------------
; xoffset = 0: copy 9 source rows into the temporary buffer without filtering.
skip_firstpass_filter
    sub         r0, r0, r1, lsl #1          ; start 2 rows above the block
    sub         r1, r1, #8                  ; r1 = stride - 8
    mov         r2, #9                      ; 9 rows

skip_firstpass_hloop
    ldrb        r4, [r0], #1                ; load data
    subs        r2, r2, #1                  ; flags survive until the bne: nothing below sets them
    ldrb        r5, [r0], #1
    strh        r4, [lr], #20               ; store it to the intermediate buffer, transposed
    ldrb        r6, [r0], #1                ; load data
    strh        r5, [lr], #20
    ldrb        r7, [r0], #1
    strh        r6, [lr], #20
    ldrb        r8, [r0], #1
    strh        r7, [lr], #20
    ldrb        r9, [r0], #1
    strh        r8, [lr], #20
    ldrb        r10, [r0], #1
    strh        r9, [lr], #20
    ldrb        r11, [r0], #1
    strh        r10, [lr], #20
    add         r0, r0, r1                  ; move to next input line
    strh        r11, [lr], #20

    sub         lr, lr, #158                ; net +2: the next halfword slot of line 0
    bne         skip_firstpass_hloop

    b           secondpass_filter

;--------------------
; yoffset = 0: copy entries 2..5 of each buffer line to dst without filtering.
skip_secondpass_filter
    mov         r2, #8                      ; 8 columns
    add         sp, sp, #4                  ; start from src[0] instead of src[-2]

skip_secondpass_hloop
    ldr         r6, [sp], #4                ; two entries for output rows 0 and 1
    subs        r2, r2, #1                  ; flags survive until the bne: nothing below sets them
    ldr         r8, [sp], #4                ; two entries for output rows 2 and 3

    mov         r7, r6, lsr #16             ; unpack the upper halfword
    strb        r6, [r0], r1
    mov         r9, r8, lsr #16
    strb        r7, [r0], r1
    add         sp, sp, #12                 ; 20-byte line minus the 8 already consumed
    strb        r8, [r0], r1
    strb        r9, [r0], r1

    sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written
    add         r0, r0, #1                  ; and step to the next dst column

    bne         skip_secondpass_hloop

    add         sp, sp, #16                 ; release the rest of the buffer: 180 - (160 + 4)

    ldmia       sp!, {r4 - r11, pc}         ; restore registers and return

    ENDP

;-----------------
; Six-tap filter coefficients, kept in the code area so ADR can reach them.
; One 16-byte entry per sub-pixel offset 0..7: three words, each packing two
; signed 16-bit taps (even tap in the low halfword, odd tap in the high
; halfword), followed by one zero word of padding.
; Entry addresses: filter8_coeff, filter8_coeff+16, filter8_coeff+32 ...
    ALIGN   4
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000
261
262    ;DCD        0,  0,  128,    0,   0,  0
263    ;DCD        0, -6,  123,   12,  -1,  0
264    ;DCD        2, -11, 108,   36,  -8,  1
265    ;DCD        0, -9,   93,   50,  -6,  0
266    ;DCD        3, -16,  77,   77, -16,  3
267    ;DCD        0, -6,   50,   93,  -9,  0
268    ;DCD        1, -8,   36,  108, -11,  2
269    ;DCD        0, -1,   12,  123,  -6,  0
270
271    END
272