/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/
740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
/***************************************************************************
  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
                 when the fill is larger than 8 bytes.
     Inputs:
        s: The buffer to write to
        c: The 16-bit value to write to the buffer
        n: The number of 16-bit items to write (not a byte count)
     Outputs:
        r0 holds the original buffer pointer s on return.
***************************************************************************/
1740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
1840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .code 32
1940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .fpu neon
2040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .align 4
2140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .globl memset16_neon
2240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .func
2340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
2440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset16_neon:
2540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #0
2640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bxeq            lr
2740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
2840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /* Keep in mind that r2 -- the count argument -- is for the
2940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * number of 16-bit items to copy.
3040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
3140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        lsl             r2, r2, #1
3240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
3340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        push            {r0}
3440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
3540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /* If we have < 8 bytes, just do a quick loop to handle that */
3640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #8
3740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bgt             memset_gt4
3840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_smallcopy_loop:
3940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strh            r1, [r0], #2
4040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r2, r2, #2
4140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bne             memset_smallcopy_loop
4240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_smallcopy_done:
4340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        pop             {r0}
4440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bx              lr
4540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
4640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_gt4:
4740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /*
4840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * Duplicate the r1 lowest 16-bits across r1. The idea is to have
4940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * a register with two 16-bit-values we can copy. We do this by
5040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * duplicating lowest 16-bits of r1 to upper 16-bits.
5140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
5240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        orr             r1, r1, r1, lsl #16
5340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /*
5440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * If we're copying > 64 bytes, then we may want to get
5540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * onto a 16-byte boundary to improve speed even more.
5640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
5740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #64
5840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_route
5940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        ands            r12, r0, #0xf
6040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_route
6140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /*
6240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * Determine the number of bytes to move forward to get to the 16-byte
6340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * boundary.  Note that this will be a multiple of 4, since we
6440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * already are word-aligned.
6540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
6640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        rsb             r12, r12, #16
6740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        sub             r2, r2, r12
6840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        lsls            r12, r12, #29
6940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strmi           r1, [r0], #4
7040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strcs           r1, [r0], #4
7140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strcs           r1, [r0], #4
7240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        lsls            r12, r12, #2
7340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strcsh          r1, [r0], #2
7440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_route:
7540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /*
7640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * Decide where to route for the maximum copy sizes.  Note that we
7740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * build q0 and q1 depending on if we'll need it, so that's
7840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * interwoven here as well.
7940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
8040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vdup.u32        d0, r1
8140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #16
8240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_8
8340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vmov            d1, d0
8440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #64
8540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_16
8640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vmov            q1, q0
8740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #128
8840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_32
8940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_128:
9040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        mov             r12, r2, lsr #7
9140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_128_loop:
9240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.64         {q0, q1}, [r0]!
9340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.64         {q0, q1}, [r0]!
9440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.64         {q0, q1}, [r0]!
9540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.64         {q0, q1}, [r0]!
9640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r12, r12, #1
9740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bne             memset_128_loop
9840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        ands            r2, r2, #0x7f
9940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_end
10040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_32:
10140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        movs            r12, r2, lsr #5
10240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_16
10340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_32_loop:
10440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r12, r12, #1
10540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.64         {q0, q1}, [r0]!
10640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bne             memset_32_loop
10740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        ands            r2, r2, #0x1f
10840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_end
10940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_16:
11040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        movs            r12, r2, lsr #4
11140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_8
11240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_16_loop:
11340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r12, r12, #1
11440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.32         {q0}, [r0]!
11540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bne             memset_16_loop
11640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        ands            r2, r2, #0xf
11740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        beq             memset_end
11840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        /*
11940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * memset_8 isn't a loop, since we try to do our loops at 16
12040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * bytes and above.  We should loop there, then drop down here
12140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * to finish the <16-byte versions.  Same for memset_4 and
12240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         * memset_1.
12340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger         */
12440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_8:
12540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #8
12640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_4
12740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r2, r2, #8
12840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        vst1.32         {d0}, [r0]!
12940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_4:
13040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #4
13140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        blt             memset_2
13240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        subs            r2, r2, #4
13340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        str             r1, [r0], #4
13440528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_2:
13540528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        cmp             r2, #0
13640528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        ble             memset_end
13740528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        strh            r1, [r0], #2
13840528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenbergermemset_end:
13940528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        pop             {r0}
14040528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        bx              lr
14140528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger
14240528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .endfunc
14340528743dbb9ce7f39f093e0cdc47849ac8887cfDerek Sollenberger        .end
144