1/***************************************************************************
2 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 ***************************************************************************/
7
8/***************************************************************************
9  Neon memset: Attempts to do a memset with Neon registers if possible,
10     Inputs:
11        s: The buffer to write to
12        c: The integer data to write to the buffer
13        n: The size_t count.
14     Outputs:
15
16***************************************************************************/
17
18        .code 32
19        .fpu neon
20        .align 4
21        .globl memset16_neon
22        .func
23
/*
 * memset16_neon(s, c, n)
 *
 * In:    r0 = s, destination buffer, written as an array of 16-bit items
 *             (assumed to be at least halfword-aligned)
 *        r1 = c, fill value; only the low 16 bits are used
 *        r2 = n, number of 16-bit items to store
 * Out:   r0 = s (pushed on entry, popped before every return)
 * Uses:  r1, r2, r12, q0, q1 and the condition flags
 *
 * Word-sized and 64-bit-element stores are only issued once r0 is known
 * to be suitably aligned; everything else uses halfword stores or Neon
 * stores with 16-bit element size, which only need halfword alignment.
 */
memset16_neon:
        cmp             r2, #0
        bxeq            lr                      /* n == 0: nothing to do */

        /*
         * The count argument is a number of 16-bit items; from here on
         * r2 holds the remaining size in bytes.
         */
        lsl             r2, r2, #1

        /* r0 doubles as the write cursor, so keep s for the return. */
        push            {r0}

        /* Up to 8 bytes (4 items): a plain halfword loop is enough. */
        cmp             r2, #8
        bgt             memset_gt4
memset_smallcopy_loop:
        strh            r1, [r0], #2
        subs            r2, r2, #2
        bne             memset_smallcopy_loop
memset_smallcopy_done:
        pop             {r0}
        bx              lr

memset_gt4:
        /*
         * Build a word that holds the fill value in both halfwords.
         * Shifting left first discards anything above bit 15 of r1, so
         * the wide stores below write exactly the value that strh writes
         * in the small-copy loop, even if the upper half of r1 is set.
         */
        mov             r1, r1, lsl #16
        orr             r1, r1, r1, lsr #16
        /*
         * With 64 bytes or more to write, first advance to a 16-byte
         * boundary so that the bulk Neon stores are aligned.
         */
        cmp             r2, #64
        blt             memset_route
        ands            r12, r0, #0xf
        beq             memset_route
        /*
         * r12 = number of bytes up to the 16-byte boundary.  For a
         * halfword-aligned buffer this is an even value in 2..14.  The
         * halfword goes first: after it r0 is word-aligned, so the word
         * stores that follow never hit an unaligned address.
         */
        rsb             r12, r12, #16
        sub             r2, r2, r12
        tst             r12, #2
        strneh          r1, [r0], #2
        tst             r12, #4
        strne           r1, [r0], #4
        tst             r12, #8
        strne           r1, [r0], #4
        strne           r1, [r0], #4
memset_route:
        /*
         * Pick the largest store size that fits.  d0, then q0, then q1
         * are filled in only as far as the chosen path needs them.
         */
        vdup.32         d0, r1
        cmp             r2, #16
        blt             memset_8
        vmov            d1, d0
        cmp             r2, #64
        blt             memset_16
        vmov            q1, q0
        cmp             r2, #128
        blt             memset_32
memset_128:
        /* r12 = number of whole 128-byte chunks (at least 1 here). */
        mov             r12, r2, lsr #7
memset_128_loop:
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        subs            r12, r12, #1
        bne             memset_128_loop
        ands            r2, r2, #0x7f
        beq             memset_end
memset_32:
        /* r12 = number of whole 32-byte chunks in what is left. */
        movs            r12, r2, lsr #5
        beq             memset_16
memset_32_loop:
        subs            r12, r12, #1
        vst1.64         {q0, q1}, [r0]!
        bne             memset_32_loop
        ands            r2, r2, #0x1f
        beq             memset_end
memset_16:
        /*
         * r12 = number of whole 16-byte chunks in what is left.  This
         * loop is also the entry point for 16..63-byte fills, which skip
         * the alignment step, hence the 16-bit element size.
         */
        movs            r12, r2, lsr #4
        beq             memset_8
memset_16_loop:
        subs            r12, r12, #1
        vst1.16         {q0}, [r0]!
        bne             memset_16_loop
        ands            r2, r2, #0xf
        beq             memset_end
        /*
         * memset_8 isn't a loop, since we do our loops at 16 bytes and
         * above.  Fewer than 16 bytes remain here, so one optional
         * 8-byte, 4-byte and 2-byte step finishes the job.  r0 may be
         * only halfword-aligned on this path, so no word store is used.
         */
memset_8:
        cmp             r2, #8
        blt             memset_4
        subs            r2, r2, #8
        vst1.16         {d0}, [r0]!
memset_4:
        cmp             r2, #4
        blt             memset_2
        subs            r2, r2, #4
        strh            r1, [r0], #2
        strh            r1, [r0], #2
memset_2:
        cmp             r2, #0
        ble             memset_end
        strh            r1, [r0], #2
memset_end:
        pop             {r0}
        bx              lr
141
142        .endfunc
143        .end
144