/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
  for the bulk of the work when the buffer is large enough.
     Inputs:
        s: The buffer to write to
        c: The integer data to write to the buffer (written as 16-bit items)
        n: The size_t count, expressed in 16-bit items (not bytes).
     Outputs:
        None. The buffer at s is filled in place; r0 is saved on entry and
        restored before returning, so it still holds s.
***************************************************************************/

18aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .code 32
19aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .fpu neon
20aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .align 4
21aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .globl memset16_neon
22aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .func
23aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
24aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset16_neon:
25aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #0
26aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bxeq            lr
27aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
28aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /* Keep in mind that r2 -- the count argument -- is for the
29aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * number of 16-bit items to copy.
30aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
31aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        lsl             r2, r2, #1
32aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
33aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        push            {r0}
34aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
35aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /* If we have < 8 bytes, just do a quick loop to handle that */
36aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #8
37aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bgt             memset_gt4
38aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_smallcopy_loop:
39aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strh            r1, [r0], #2
40aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r2, r2, #2
41aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bne             memset_smallcopy_loop
42aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_smallcopy_done:
43aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        pop             {r0}
44aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bx              lr
45aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
46aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_gt4:
47aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /*
48aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * Duplicate the r1 lowest 16-bits across r1. The idea is to have
49aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * a register with two 16-bit-values we can copy. We do this by
50aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * duplicating lowest 16-bits of r1 to upper 16-bits.
51aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
52aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        orr             r1, r1, r1, lsl #16
53aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /*
54aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * If we're copying > 64 bytes, then we may want to get
55aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * onto a 16-byte boundary to improve speed even more.
56aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
57aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #64
58aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_route
59aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        ands            r12, r0, #0xf
60aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_route
61aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /*
62aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * Determine the number of bytes to move forward to get to the 16-byte
63aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * boundary.  Note that this will be a multiple of 4, since we
64aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * already are word-aligned.
65aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
66aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        rsb             r12, r12, #16
67aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        sub             r2, r2, r12
68aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        lsls            r12, r12, #29
69aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strmi           r1, [r0], #4
70aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strcs           r1, [r0], #4
71aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strcs           r1, [r0], #4
72aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        lsls            r12, r12, #2
73aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strcsh          r1, [r0], #2
74aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_route:
75aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /*
76aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * Decide where to route for the maximum copy sizes.  Note that we
77aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * build q0 and q1 depending on if we'll need it, so that's
78aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * interwoven here as well.
79aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
80aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vdup.u32        d0, r1
81aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #16
82aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_8
83aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vmov            d1, d0
84aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #64
85aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_16
86aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vmov            q1, q0
87aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #128
88aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_32
89aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_128:
90aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        mov             r12, r2, lsr #7
91aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_128_loop:
92aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.64         {q0, q1}, [r0]!
93aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.64         {q0, q1}, [r0]!
94aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.64         {q0, q1}, [r0]!
95aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.64         {q0, q1}, [r0]!
96aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r12, r12, #1
97aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bne             memset_128_loop
98aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        ands            r2, r2, #0x7f
99aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_end
100aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_32:
101aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        movs            r12, r2, lsr #5
102aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_16
103aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_32_loop:
104aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r12, r12, #1
105aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.64         {q0, q1}, [r0]!
106aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bne             memset_32_loop
107aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        ands            r2, r2, #0x1f
108aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_end
109aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_16:
110aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        movs            r12, r2, lsr #4
111aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_8
112aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_16_loop:
113aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r12, r12, #1
114aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.32         {q0}, [r0]!
115aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bne             memset_16_loop
116aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        ands            r2, r2, #0xf
117aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        beq             memset_end
118aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        /*
119aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * memset_8 isn't a loop, since we try to do our loops at 16
120aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * bytes and above.  We should loop there, then drop down here
121aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * to finish the <16-byte versions.  Same for memset_4 and
122aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         * memset_1.
123aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org         */
124aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_8:
125aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #8
126aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_4
127aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r2, r2, #8
128aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        vst1.32         {d0}, [r0]!
129aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_4:
130aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #4
131aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        blt             memset_2
132aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        subs            r2, r2, #4
133aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        str             r1, [r0], #4
134aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_2:
135aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        cmp             r2, #0
136aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        ble             memset_end
137aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        strh            r1, [r0], #2
138aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.orgmemset_end:
139aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        pop             {r0}
140aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        bx              lr
141aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org
142aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .endfunc
143aab4090b57ded8ad9dce95f92ad678c9829ab8bcagl@chromium.org        .end
144