1/***************************************************************************
2 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 ***************************************************************************/
7
/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
  for the bulk of the work when the buffer is large enough.
     Inputs:
        s: The buffer to write to
        c: The 16-bit value to write to the buffer
        n: The size_t count of 16-bit items (not bytes) to write.
     Outputs:
        The buffer address s is left in r0 on return.
***************************************************************************/
17
18        .syntax unified
19
20        .code 32
21        .fpu neon
22        .align 4
23        .globl memset16_neon
24
25memset16_neon:
26        cmp             r2, #0
27        bxeq            lr
28
29        /* Keep in mind that r2 -- the count argument -- is for the
30         * number of 16-bit items to copy.
31         */
32        lsl             r2, r2, #1
33
34        push            {r0}
35
36        /* If we have < 8 bytes, just do a quick loop to handle that */
37        cmp             r2, #8
38        bgt             memset_gt4
39memset_smallcopy_loop:
40        strh            r1, [r0], #2
41        subs            r2, r2, #2
42        bne             memset_smallcopy_loop
43memset_smallcopy_done:
44        pop             {r0}
45        bx              lr
46
47memset_gt4:
48        /*
49         * Duplicate the r1 lowest 16-bits across r1. The idea is to have
50         * a register with two 16-bit-values we can copy. We do this by
51         * duplicating lowest 16-bits of r1 to upper 16-bits.
52         */
53        orr             r1, r1, r1, lsl #16
54        /*
55         * If we're copying > 64 bytes, then we may want to get
56         * onto a 16-byte boundary to improve speed even more.
57         */
58        cmp             r2, #64
59        blt             memset_route
60        ands            r12, r0, #0xf
61        beq             memset_route
62        /*
63         * Determine the number of bytes to move forward to get to the 16-byte
64         * boundary.  Note that this will be a multiple of 4, since we
65         * already are word-aligned.
66         */
67        rsb             r12, r12, #16
68        sub             r2, r2, r12
69        lsls            r12, r12, #29
70        strmi           r1, [r0], #4
71        strcs           r1, [r0], #4
72        strcs           r1, [r0], #4
73        lsls            r12, r12, #2
74        strhcs          r1, [r0], #2
75memset_route:
76        /*
77         * Decide where to route for the maximum copy sizes.  Note that we
78         * build q0 and q1 depending on if we'll need it, so that's
79         * interwoven here as well.
80         */
81        vdup.u32        d0, r1
82        cmp             r2, #16
83        blt             memset_8
84        vmov            d1, d0
85        cmp             r2, #64
86        blt             memset_16
87        vmov            q1, q0
88        cmp             r2, #128
89        blt             memset_32
90memset_128:
91        mov             r12, r2, lsr #7
92memset_128_loop:
93        vst1.64         {q0, q1}, [r0]!
94        vst1.64         {q0, q1}, [r0]!
95        vst1.64         {q0, q1}, [r0]!
96        vst1.64         {q0, q1}, [r0]!
97        subs            r12, r12, #1
98        bne             memset_128_loop
99        ands            r2, r2, #0x7f
100        beq             memset_end
101memset_32:
102        movs            r12, r2, lsr #5
103        beq             memset_16
104memset_32_loop:
105        subs            r12, r12, #1
106        vst1.64         {q0, q1}, [r0]!
107        bne             memset_32_loop
108        ands            r2, r2, #0x1f
109        beq             memset_end
110memset_16:
111        movs            r12, r2, lsr #4
112        beq             memset_8
113memset_16_loop:
114        subs            r12, r12, #1
115        vst1.32         {q0}, [r0]!
116        bne             memset_16_loop
117        ands            r2, r2, #0xf
118        beq             memset_end
119        /*
120         * memset_8 isn't a loop, since we try to do our loops at 16
121         * bytes and above.  We should loop there, then drop down here
122         * to finish the <16-byte versions.  Same for memset_4 and
123         * memset_1.
124         */
125memset_8:
126        cmp             r2, #8
127        blt             memset_4
128        subs            r2, r2, #8
129        vst1.32         {d0}, [r0]!
130memset_4:
131        cmp             r2, #4
132        blt             memset_2
133        subs            r2, r2, #4
134        str             r1, [r0], #4
135memset_2:
136        cmp             r2, #0
137        ble             memset_end
138        strh            r1, [r0], #2
139memset_end:
140        pop             {r0}
141        bx              lr
142
143        .end
144