;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_short_walsh4x4_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY  ; name this block of code

;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
; r0    short *input,
; r1    short *output,
; r2    int pitch
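;
; Rough sketch of the arithmetic (an assumption pieced together from the
; register comments below, not a verbatim copy of the C reference):
;
;   horizontal pass, for each input row ip[0..3] (rows are 'pitch' bytes
;   apart):
;     a1 = ip[0] + ip[2]      d1 = ip[1] + ip[3]
;     b1 = ip[0] - ip[2]      c1 = ip[1] - ip[3]
;     row[0] = 4*a1 + 4*d1 + (a1 != 0)
;     row[1] = 4*b1 + 4*c1
;     row[2] = 4*b1 - 4*c1
;     row[3] = 4*a1 - 4*d1
;
;   vertical pass, for each column c of the intermediate rows above,
;   with round(x) = (x + (x < 0) + 3) >> 3:
;     a1 = col[0] + col[2]    d1 = col[1] + col[3]
;     b1 = col[0] - col[2]    c1 = col[1] - col[3]
;     op[c]    = round(a1 + d1)
;     op[c+4]  = round(b1 + c1)
;     op[c+8]  = round(b1 - c1)
;     op[c+12] = round(a1 - d1)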
|vp8_short_walsh4x4_armv6| PROC

    stmdb       sp!, {r4 - r11, lr}

    ldrd        r4, r5, [r0], r2
    ldr         lr, c00040004
    ldrd        r6, r7, [r0], r2

    ; 0-3
    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]

    ldrd        r8, r9, [r0], r2
    ; 4-7
    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]

    ldrd        r10, r11, [r0]
    ; 8-11
    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]

    ; 12-15
    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]


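    ; After the horizontal pass (values not yet scaled by 4):
    ;   r3, r5, r7, r9  hold [d1|a1] for rows 0..3
    ;   r4, r6, r8, r10 hold [c1|b1] for rows 0..3
    ; The smuad/smusd instructions below multiply both halves by the
    ; packed constant 0x00040004 (lr), giving 4*x +/- 4*y per column.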
    lsls        r2, r3, #16
    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
    addne       r11, r11, #1        ; A0 += (a1!=0)

    lsls        r2, r7, #16
    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
    addne       r12, r12, #1        ; C0 += (a1!=0)

    add         r0, r11, r12        ; a1_0 = A0 + C0
    sub         r11, r11, r12       ; b1_0 = A0 - C0

    lsls        r2, r5, #16
    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
    addne       r12, r12, #1        ; B0 += (a1!=0)

    lsls        r2, r9, #16
    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
    addne       r2, r2, #1          ; D0 += (a1!=0)

    add         lr, r12, r2         ; d1_0 = B0 + D0
    sub         r12, r12, r2        ; c1_0 = B0 - D0

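    ; Each final coefficient below is rounded as (x + (x < 0) + 3) >> 3,
    ; implemented with addmi / add #3 / asr #3.  Output offsets are in
    ; bytes, so op[n] is stored at [r1, #2*n].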
    ; op[0,4,8,12]
    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1]            ; op[0]

    addmi       r0, r0, #1          ; += d2 < 0
    add         r0, r0, #3          ; += 3
    ldr         lr, c00040004       ; reload the packed {4,4} constant
    mov         r0, r0, asr #3      ; >> 3
    strh        r0, [r1, #24]       ; op[12]

    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
    addmi       r2, r2, #1          ; += b2 < 0
    add         r2, r2, #3          ; += 3
    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #8]        ; op[4]

    addmi       r0, r0, #1          ; += c2 < 0
    add         r0, r0, #3          ; += 3
    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
    mov         r0, r0, asr #3      ; >> 3
    strh        r0, [r1, #16]       ; op[8]


    ; op[3,7,11,15]
    add         r0, r3, r7          ; a1_3 = A3 + C3
    sub         r3, r3, r7          ; b1_3 = A3 - C3

    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
    add         r7, r5, r9          ; d1_3 = B3 + D3
    sub         r5, r5, r9          ; c1_3 = B3 - D3

    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #6]        ; op[3]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #14]       ; op[7]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #22]       ; op[11]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #30]       ; op[15]

    ; op[1,5,9,13]
    add         r0, r3, r5          ; a1_1 = A1 + C1
    sub         r3, r3, r5          ; b1_1 = A1 - C1

    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
    add         r5, r7, r9          ; d1_1 = B1 + D1
    sub         r7, r7, r9          ; c1_1 = B1 - D1

    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #2]        ; op[1]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #10]       ; op[5]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #18]       ; op[9]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #26]       ; op[13]


    ; op[2,6,10,14]
    add         r11, r4, r8         ; a1_2 = A2 + C2
    sub         r12, r4, r8         ; b1_2 = A2 - C2

    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
    add         r4, r6, r10         ; d1_2 = B2 + D2
    sub         r8, r6, r10         ; c1_2 = B2 - D2

    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
    addmi       r2, r2, #1          ; += a2 < 0
    add         r2, r2, #3          ; += 3
    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #4]        ; op[2]

    addmi       r9, r9, #1          ; += b2 < 0
    add         r9, r9, #3          ; += 3
    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #12]       ; op[6]

    addmi       r2, r2, #1          ; += c2 < 0
    add         r2, r2, #3          ; += 3
    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
    mov         r2, r2, asr #3      ; >> 3
    strh        r2, [r1, #20]       ; op[10]

    addmi       r9, r9, #1          ; += d2 < 0
    add         r9, r9, #3          ; += 3
    mov         r9, r9, asr #3      ; >> 3
    strh        r9, [r1, #28]       ; op[14]


    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_short_walsh4x4_armv6|

c00040004
    DCD         0x00040004

    END