1/*
2 * Copyright 2010 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8/* Changes:
9 * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
10 *    Added small changes to the two functions to make them work on the
11 *    specified number of 16- or 32-bit values rather than the original
12 *    code which was specified as a count of bytes. More verbose comments
13 *    to aid future maintenance.
14 */
15
16    .text
17    .align 4
18    .syntax unified
19
20    .global arm_memset32
21    .type   arm_memset32, %function
22    .global arm_memset16
23    .type   arm_memset16, %function
24
25/*
26 * Optimized memset functions for ARM.
27 *
28 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
29 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
30 *
31 */
/*
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 *
 * In:    r0 = dst    (only bit 1 of the address is examined, so dst is
 *                     expected to be at least 2-byte aligned)
 *        r1 = value  (upper 16 bits are expected to be zero, as the
 *                     caller zero-extends a uint16_t argument)
 *        r2 = count  (number of 16-bit elements; <= 0 stores nothing)
 * Out:   nothing
 *
 * Converts the request to a byte count and a 32-bit fill pattern, stores
 * one leading halfword if needed to reach 32-bit alignment, then
 * branches into the shared fill code at .Lwork_32 (inside arm_memset32),
 * which also performs the final return.
 */
arm_memset16:
        .fnstart
        push        {lr}
        .save       {lr}            /* unwind info: return address is on the stack */

        /* Nothing to do when count <= 0.
         * This must be CMP rather than TEQ: BLE tests (Z set) or (N != V),
         * and TEQ leaves V untouched, so a stale V flag inherited from the
         * caller could make a positive count take this branch and skip the
         * fill. CMP against #0 always clears V. */
        cmp         r2, #0
        ble         .Lfinish

        /* Multiply count by 2 - go from the number of 16-bit shorts
         * to the number of bytes desired. */
        mov         r2, r2, lsl #1

        /* Replicate the 16-bit value into both halves of r1. */
        orr         r1, r1, r1, lsl #16

        /* If dst is only halfword-aligned, store one halfword so that
         * r0 becomes 32-bit aligned, and account for those two bytes. */
        tst         r0, #2
        strhne      r1, [r0], #2
        subne       r2, r2, #2

        /* Continue in the shared word-fill code: r0 = aligned dst,
         * r1 = 32-bit pattern, r2 = remaining byte count. */
        b           .Lwork_32
        .fnend
55
/*
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 * In:    r0 = dst    (the word stores below expect 4-byte alignment)
 *        r1 = value
 *        r2 = count  (number of 32-bit elements; <= 0 stores nothing)
 * Out:   nothing
 * Clobb: r0-r3, ip, flags (lr is used as scratch; the saved return
 *        address is popped straight into pc)
 *
 * arm_memset16 enters at .Lwork_32 with lr already pushed, r1 holding a
 * 32-bit fill pattern and r2 holding a byte count, and returns through
 * .Lfinish.
 */
arm_memset32:
        .fnstart
        push        {lr}
        .save       {lr}            /* unwind info: return address is on the stack */

        /* Nothing to do when count <= 0.
         * This must be CMP rather than TEQ: BLE tests (Z set) or (N != V),
         * and TEQ leaves V untouched, so a stale V flag inherited from the
         * caller could make a positive count take this branch and skip the
         * fill. CMP against #0 always clears V. */
        cmp         r2, #0
        ble         .Lfinish

        /* Multiply count by 4 - go from the number of 32-bit words to
         * the number of bytes desired. */
        mov         r2, r2, lsl #2

.Lwork_32:
        /* From here on r2 is a byte count. Copy the pattern into ip and
         * lr so that multi-register stores can write it several times. */
        mov         ip, r1
        mov         lr, r1

        /* Try to align the destination to a cache line. Assume 32
         * byte (8 word) cache lines, it's the common case.
         * r3 = (-dst) & 0x1C = bytes, in whole words, up to the next
         * 32-byte boundary. */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Laligned32
        /* Never write more than was asked for: if the distance to the
         * boundary exceeds the total, use the whole-word part of the
         * total instead. */
        cmp         r3, r2
        andhi       r3, r2, #0x1C
        sub         r2, r2, r3

        /* (Optionally) write any unaligned leading bytes.
         * (0-28 bytes, length in r3)
         * The shift by 28 moves bit 4 of r3 into C (16 bytes) and bit 3
         * into N (8 bytes); the following shift by 2 moves bit 2 into C
         * (4 bytes). */
        movs        r3, r3, lsl #28
        stmiacs     r0!, {r1, lr}
        stmiacs     r0!, {r1, lr}
        stmiami     r0!, {r1, lr}
        movs        r3, r3, lsl #2
        strcs       r1, [r0], #4

        /* Now quickly loop through the cache-aligned data,
         * 32 bytes per iteration. */
.Laligned32:
        mov         r3, r1
1:      subs        r2, r2, #32
        stmiahs     r0!, {r1,r3,ip,lr}
        stmiahs     r0!, {r1,r3,ip,lr}
        bhs         1b
        /* The last SUBS went below zero; undo it to recover the
         * remaining byte count. */
        add         r2, r2, #32

        /* (Optionally) store any remaining trailing bytes.
         * (0-30 bytes, length in r2)
         * Same flag trick as above: C/N select the 16- and 8-byte
         * stores, then C/N select the 4- and 2-byte stores. */
        movs        r2, r2, lsl #28
        stmiacs     r0!, {r1,r3,ip,lr}
        stmiami     r0!, {r1,lr}
        movs        r2, r2, lsl #2
        strcs       r1, [r0], #4
        strhmi      lr, [r0], #2

.Lfinish:
        /* Return by popping the saved lr directly into pc. */
        pop         {pc}
        .fnend
112