memset16_neon.S revision 40528743dbb9ce7f39f093e0cdc47849ac8887cf
1/***************************************************************************
2 Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
3
4 Licensed under the Apache License, Version 2.0 (the "License"); you
5 may not use this file except in compliance with the License.  You may
6 obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13 implied.  See the License for the specific language governing
14 permissions and limitations under the License.
15 ***************************************************************************/
16
/***************************************************************************
  memset16_neon: fill a buffer with a 16-bit value, using NEON stores for
  the bulk of the work.

     Syntax / target: GNU as, ARM state (.code 32), divided (pre-UAL)
     mnemonics, NEON required.

     Inputs:
        r0 = s: The buffer to write to.  Expected to be 2-byte aligned
                (all alignment arithmetic below ignores bit 0).
        r1 = c: The value to write; only the low 16 bits are used.
        r2 = n: The number of 16-bit items (NOT bytes) to write.
                n * 2 must fit in 32 bits.
     Outputs:
        r0 = s, unchanged.
     Clobbers:
        r1, r2, r3, r12, q0, q1 (d0-d3), condition flags.
     Stack usage:
        none (leaf routine).
***************************************************************************/

        .text
        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        /* Mark the symbol as a function so ELF tools treat it as code. */
        .type memset16_neon, %function
        .func memset16_neon

memset16_neon:
        cmp             r2, #0
        bxeq            lr

        /* Keep in mind that r2 -- the count argument -- is the number of
         * 16-bit items to store.  From here on r2 is a byte count, and it
         * is always even.
         */
        lsl             r2, r2, #1

        /* r3 is the write cursor, so r0 is never modified and no
         * stack save/restore is needed.
         */
        mov             r3, r0

        /* If we have <= 8 bytes (4 items), a simple halfword loop is
         * enough.  The byte count is unsigned, hence bhi rather than bgt.
         */
        cmp             r2, #8
        bhi             memset_gt4
memset_smallcopy_loop:
        strh            r1, [r3], #2
        subs            r2, r2, #2
        bne             memset_smallcopy_loop
memset_smallcopy_done:
        bx              lr

memset_gt4:
        /*
         * Build r1 = c16:c16, i.e. the low 16 bits of c in both halves.
         * Shifting left first discards whatever was in bits 31..16, so
         * the result does not depend on how the caller extended c.
         */
        mov             r1, r1, lsl #16
        orr             r1, r1, r1, lsr #16
        /*
         * If we're storing >= 64 bytes, get onto a 16-byte boundary
         * first so the NEON loops below run on aligned addresses.
         */
        cmp             r2, #64
        blo             memset_route
        ands            r12, r3, #0xf
        beq             memset_route
        /*
         * r12 = number of bytes to the next 16-byte boundary.  For a
         * 2-byte-aligned destination this is an even value in 2..14, so
         * r2 (>= 64 here) stays positive after the subtraction.
         *
         * The pieces are stored smallest first: after the optional
         * halfword the cursor is 4-byte aligned, after the optional word
         * it is 8-byte aligned, so every store is naturally aligned.
         */
        rsb             r12, r12, #16
        sub             r2, r2, r12
        tst             r12, #2
        strneh          r1, [r3], #2
        tst             r12, #4
        strne           r1, [r3], #4
        tst             r12, #8
        strne           r1, [r3], #4
        strne           r1, [r3], #4
memset_route:
        /*
         * Decide where to route for the maximum store sizes.  Note that we
         * build q0 and q1 only if the chosen path needs them, so that is
         * interwoven here as well.
         *
         * All NEON stores use 16-bit elements: the data written is the
         * same for any element size because every halfword is identical,
         * and a 16-bit element matches the 2-byte alignment that is all
         * the destination is expected to have on the paths that skipped
         * the alignment step above.
         */
        vdup.16         d0, r1
        cmp             r2, #16
        blo             memset_8
        vmov            d1, d0
        cmp             r2, #64
        blo             memset_16
        vmov            q1, q0
        cmp             r2, #128
        blo             memset_32
memset_128:
        /* r12 = number of whole 128-byte chunks (>= 1 here) */
        mov             r12, r2, lsr #7
memset_128_loop:
        vst1.16         {q0, q1}, [r3]!
        vst1.16         {q0, q1}, [r3]!
        vst1.16         {q0, q1}, [r3]!
        vst1.16         {q0, q1}, [r3]!
        subs            r12, r12, #1
        bne             memset_128_loop
        ands            r2, r2, #0x7f
        beq             memset_end
memset_32:
        /* r12 = number of whole 32-byte chunks left */
        movs            r12, r2, lsr #5
        beq             memset_16
memset_32_loop:
        subs            r12, r12, #1
        vst1.16         {q0, q1}, [r3]!
        bne             memset_32_loop
        ands            r2, r2, #0x1f
        beq             memset_end
memset_16:
        /* r12 = number of whole 16-byte chunks left */
        movs            r12, r2, lsr #4
        beq             memset_8
memset_16_loop:
        subs            r12, r12, #1
        vst1.16         {q0}, [r3]!
        bne             memset_16_loop
        ands            r2, r2, #0xf
        beq             memset_end
        /*
         * memset_8 isn't a loop, since we do our loops at 16 bytes and
         * above.  By the time we get here fewer than 16 bytes remain, so
         * at most one 8-byte, one 4-byte and one 2-byte store finish the
         * job.
         */
memset_8:
        cmp             r2, #8
        blo             memset_4
        sub             r2, r2, #8
        vst1.16         {d0}, [r3]!
memset_4:
        cmp             r2, #4
        blo             memset_2
        sub             r2, r2, #4
        /* Two halfword stores instead of one word store: the cursor is
         * not necessarily word-aligned on this path.
         */
        strh            r1, [r3], #2
        strh            r1, [r3], #2
memset_2:
        cmp             r2, #0
        beq             memset_end
        strh            r1, [r3]
memset_end:
        bx              lr

        .endfunc
        .size memset16_neon, . - memset16_neon
        .end
153