@ convolve_neon.s revision f3664ae9369a861ffbc2354e8e93e48983802062
1@/*
2@ ** Copyright 2003-2010, VisualOn, Inc.
3@ **
4@ ** Licensed under the Apache License, Version 2.0 (the "License");
5@ ** you may not use this file except in compliance with the License.
6@ ** You may obtain a copy of the License at
7@ **
8@ **     http://www.apache.org/licenses/LICENSE-2.0
9@ **
10@ ** Unless required by applicable law or agreed to in writing, software
11@ ** distributed under the License is distributed on an "AS IS" BASIS,
12@ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13@ ** See the License for the specific language governing permissions and
14@ ** limitations under the License.
15@ */
16@
17@*void Convolve (
18@*    Word16 x[],        /* (i)     : input vector                           */
19@*    Word16 h[],        /* (i)     : impulse response                       */
20@*    Word16 y[],        /* (o)     : output vector                          */
21@*    Word16 L           /* (i)     : vector size                            */
22@*)
23@
24@ r0 --- x[]
25@ r1 --- h[]
26@ r2 --- y[]
27@ r3 --- L
28
	.section  .text
        .global   Convolve_asm

@----------------------------------------------------------------------------
@ Convolve_asm
@
@ NEON implementation of the Convolve() routine described in the header:
@
@     y[n] = (0x8000 + 2 * sum_{i=0}^{n} x[i] * h[n-i]) >> 16,   n = 0..63
@
@ i.e. a Q15 convolution: each 32-bit accumulator s is left-shifted by one,
@ rounded with 0x8000, and the high halfword is stored (extract_h with
@ rounding).
@
@ NOTE(review): the L argument passed in r3 is ignored -- r3 is immediately
@ reset to 0 and reused as the output index n, and the outer loop runs to a
@ hard-coded 64 (see CMP r3, #64 at the bottom).  This assumes L == 64 and
@ that x[] and h[] each hold at least 64 samples -- confirm against callers.
@
@ Register roles inside the loop:
@   r0  = x base (never advanced)     r1  = h base (never advanced)
@   r2  = y, post-incremented          r3  = output index n (0..63)
@   r11 = 0x8000 rounding constant
@   r4  = descending h pointer         r6  = ascending x pointer
@   r5  = i, products remaining for the current n
@   r8  = scalar accumulator           Q10 = 4-lane 32-bit NEON accumulator
@
@ Each pass of LOOP emits four consecutive outputs (n % 4 == 0, 1, 2, 3).
@ For each output, the first (n + 1) % 4 products are formed with scalar
@ MUL/MLA so the remaining count is a multiple of four; the NEON inner loop
@ then folds four products per iteration (VMLAL.S16), and the four lanes
@ are reduced with VADD/VPADD afterwards.
@----------------------------------------------------------------------------

Convolve_asm:

        STMFD          r13!, {r4 - r12, r14}             @ save callee-saved regs + LR
        MOV            r3,  #0                           @ n = 0 (incoming L discarded)
	MOV            r11, #0x8000                      @ rounding constant

@ ---- output with n % 4 == 0: i = n + 1, one scalar product first --------
LOOP:
        @MOV            r8, #0                            @ s = 0
        ADD            r4, r1, r3, LSL #1                @ tmpH address = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4]                         @ *tmpH--
        SUB            r5, r5, #1                        @ one product handled in scalar
        VMOV.S32       Q10, #0                           @ clear NEON accumulator
        MUL            r8,  r9, r10                      @ s = x[0] * h[n]

@ NEON inner loop: 4 products per iteration (i is a multiple of 4 here)
LOOP1:
        CMP            r5, #0                            @ products remaining?
        BLE            L1
        SUB            r4, r4, #8                        @ step h pointer back 4 samples
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ 4 ascending x samples
        VLD1.S16       D1, [r9]!                         @ 4 h samples, ascending load...
        VREV64.16      D1, D1                            @ ...reversed -> descending h
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1                       @ Q10 += x * h (widening MAC)
        B              LOOP1
L1:
        VADD.S32       D20, D20, D21                     @ reduce 4 lanes...
        VPADD.S32      D20, D20, D20                     @ ...to one 32-bit sum
        VMOV.S32       r5, D20[0]
        ADD            r5, r5, r8                        @ s = NEON sum + scalar part
        ADD            r5, r11, r5, LSL #1               @ 0x8000 + (s << 1)
        MOV            r5, r5, LSR #16                   @extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @y[n]


@ ---- output with n % 4 == 1: two scalar products first ------------------
        @MOV            r8, #0
        ADD            r4, r1, r3, LSL #1                @tmpH address = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ *tmpX++
        LDRSH          r10, [r4], #-2                    @ h[n], then step down
        LDRSH          r12, [r6], #2                     @ x[1]
        LDRSH          r14, [r4]                         @ h[n-1]

        MUL            r8, r9, r10                       @ s  = x[0] * h[n]
        SUB            r5, r5, #2                        @ two products handled in scalar
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]

        VMOV.S32       Q10, #0                           @ clear NEON accumulator
LOOP2:
        CMP            r5, #0                            @ products remaining?
        BLE            L2
        SUB            r4, r4, #8                        @ step h pointer back 4 samples
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ 4 ascending x samples
        VLD1.S16       D1, [r9]!                         @ 4 h samples, ascending load...
        SUBS           r5, r5, #4
        VREV64.16      D1, D1                            @ ...reversed -> descending h
        VMLAL.S16      Q10, D0, D1                       @ Q10 += x * h (widening MAC)
        B              LOOP2
L2:
        VADD.S32       D20, D20, D21                     @ reduce 4 lanes...
        VPADD.S32      D20, D20, D20                     @ ...to one 32-bit sum
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ s = scalar part + NEON sum
        ADD            r8, r11, r8, LSL #1               @ 0x8000 + (s << 1)
        MOV            r8, r8, LSR #16                   @extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @y[n]


@ ---- output with n % 4 == 2: three scalar products first ----------------
        @MOV            r8, #0
        ADD            r4, r1, r3, LSL #1                @ tmpH address = &h[n]
        ADD            r5, r3, #1                        @ i = n + 1
        MOV            r6, r0                            @ tmpX = x
        LDRSH          r9,  [r6], #2                     @ x[0]
        LDRSH          r10, [r4], #-2                    @ h[n], then step down
        LDRSH          r12, [r6], #2                     @ x[1]
        LDRSH          r14, [r4], #-2                    @ h[n-1], then step down
        MUL            r8, r9, r10                       @ s  = x[0] * h[n]
        LDRSH          r9,  [r6], #2                     @ x[2]
        LDRSH          r10, [r4]                         @ h[n-2]
        MLA            r8, r12, r14, r8                  @ s += x[1] * h[n-1]
        SUB            r5, r5, #3                        @ three products handled in scalar
        MLA            r8, r9, r10, r8                   @ s += x[2] * h[n-2]

        VMOV.S32       Q10, #0                           @ clear NEON accumulator
LOOP3:
        CMP            r5, #0                            @ products remaining?
        BLE            L3
        SUB            r4, r4, #8                        @ step h pointer back 4 samples
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ 4 ascending x samples
        VLD1.S16       D1, [r9]!                         @ 4 h samples, ascending load...
        VREV64.16      D1, D1                            @ ...reversed -> descending h
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1                       @ Q10 += x * h (widening MAC)
        B              LOOP3

L3:
        VADD.S32       D20, D20, D21                     @ reduce 4 lanes...
        VPADD.S32      D20, D20, D20                     @ ...to one 32-bit sum
        VMOV.S32       r5, D20[0]
        ADD            r8, r8, r5                        @ s = scalar part + NEON sum
        ADD            r8, r11, r8, LSL #1               @ 0x8000 + (s << 1)
        MOV            r8, r8, LSR #16                   @extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r8, [r2], #2                      @y[n]

@ ---- output with n % 4 == 3: i = n + 1 is a multiple of 4, NEON only ----
        ADD            r5, r3, #1                        @ i = n + 1
        ADD            r4, r1, r5, LSL #1                @ tmpH address (one past h[n])
        MOV            r6, r0                            @ tmpX = x
        VMOV.S32       Q10, #0                           @ clear NEON accumulator
LOOP4:
        CMP            r5, #0                            @ products remaining?
        BLE            L4
        SUB            r4, r4, #8                        @ step h pointer back 4 samples
        MOV            r9, r4
        VLD1.S16       D0, [r6]!                         @ 4 ascending x samples
        VLD1.S16       D1, [r9]!                         @ 4 h samples, ascending load...
        VREV64.16      D1, D1                            @ ...reversed -> descending h
        SUBS           r5, r5, #4
        VMLAL.S16      Q10, D0, D1                       @ Q10 += x * h (widening MAC)
        B              LOOP4
L4:
        VADD.S32       D20, D20, D21                     @ reduce 4 lanes...
        VPADD.S32      D20, D20, D20                     @ ...to one 32-bit sum
        VMOV.S32       r5,  D20[0]
        ADD            r5, r11, r5, LSL #1               @ 0x8000 + (s << 1); no scalar part
        MOV            r5, r5, LSR #16                   @extract_h(s)
        ADD            r3, r3, #1                        @ n++
        STRH           r5, [r2], #2                      @y[n]

        CMP            r3, #64                           @ hard-coded vector size: 64 outputs
        BLT            LOOP

Convolve_asm_end:

        LDMFD      r13!, {r4 - r12, r15}                 @ restore regs; PC <- saved LR (return)

        @ENDFUNC
        .END
178
179