1/*
2 * Copyright 2010 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8/* Changes:
9 * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
10 *    Added small changes to the two functions to make them work on the
11 *    specified number of 16- or 32-bit values rather than the original
12 *    code which was specified as a count of bytes. More verbose comments
13 *    to aid future maintenance.
14 */
15
16    .text
17    .align
18
19    .global arm_memset32
20    .type   arm_memset32, %function
21    .global arm_memset16
22    .type   arm_memset16, %function
23
24/*
25 * Optimized memset functions for ARM.
26 *
27 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
28 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
29 *
30 */
arm_memset16:
        .fnstart
        push        {lr}

        /* Bail out early for count <= 0.
         * Fix: the original used TEQ here, but TEQ updates only N and Z
         * (and C from the shifter) -- it leaves V unchanged. BLE tests
         * Z || (N != V), so a stale V flag inherited from the caller
         * could make a positive count spuriously abort, or let a
         * negative count fall through into the fill loop. CMP sets V
         * from the subtraction, making the signed BLE test reliable. */
        cmp         r2, #0
        ble         .Lfinish

        /* Convert the count of 16-bit halfwords into a byte count. */
        mov         r2, r2, lsl #1

        /* Duplicate the 16-bit value into both halves of r1 so the
         * shared 32-bit store path below can be used as-is. */
        orr         r1, r1, lsl #16

        /* If dst is only 2-byte aligned, emit one halfword store to
         * reach 4-byte alignment, and drop 2 from the byte count. */
        tst         r0, #2
        strneh      r1, [r0], #2
        subne       r2, r2, #2

        /* Fall into the shared word-aligned fill loop in arm_memset32. */
        b           .Lwork_32
        .fnend
54
arm_memset32:
        .fnstart
        push        {lr}

        /* Bail out early for count <= 0.
         * Fix: the original used TEQ here, but TEQ leaves the V flag
         * unchanged, and BLE tests Z || (N != V); a stale V inherited
         * from the caller could misdirect the signed test. CMP sets
         * N, Z, C and V from (r2 - 0), making BLE reliable. */
        cmp         r2, #0
        ble         .Lfinish

        /* Convert the count of 32-bit words into a byte count. */
        mov         r2, r2, lsl #2

.Lwork_32:
        /* Replicate the fill value so {r1, r3, ip, lr} can store
         * 16 bytes per STM. (r3 is loaded just before the main loop,
         * after the alignment code is done using it as scratch.) */
        mov         ip, r1
        mov         lr, r1

        /* Align the destination to a 32-byte (8-word) cache line --
         * the common line size. r3 = leading bytes needed (0-28,
         * always a multiple of 4; r0 is 4-byte aligned on entry here).
         * If the total length is smaller than the distance to the next
         * line, clamp r3 to the word-rounded total instead. */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Laligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C
        sub         r2, r2, r3

        /* (Optionally) write the 0-28 leading bytes (length in r3).
         * MOVS ... LSL #28 moves bit 4 of r3 into C and bit 3 into N:
         *   C set -> store 16 bytes (two 8-byte STMs)
         *   N set -> store 8 bytes
         * The second MOVS ... LSL #2 then moves bit 2 into C:
         *   C set -> store the final 4 bytes */
        movs        r3, r3, lsl #28
        stmcsia     r0!, {r1, lr}
        stmcsia     r0!, {r1, lr}
        stmmiia     r0!, {r1, lr}
        movs        r3, r3, lsl #2
        strcs       r1, [r0], #4

        /* Main loop: one 32-byte cache line per iteration. */
.Laligned32:
        mov         r3, r1
1:      subs        r2, r2, #32
        stmhsia     r0!, {r1,r3,ip,lr}
        stmhsia     r0!, {r1,r3,ip,lr}
        bhs         1b
        add         r2, r2, #32     /* undo the final over-subtraction */

        /* (Optionally) store the 0-30 trailing bytes (length in r2),
         * using the same flag trick as above:
         *   bit 4 -> 16 bytes, bit 3 -> 8, bit 2 -> 4, bit 1 -> 2. */
        movs        r2, r2, lsl #28
        stmcsia     r0!, {r1,r3,ip,lr}
        stmmiia     r0!, {r1,lr}
        movs        r2, r2, lsl #2
        strcs       r1, [r0], #4
        strmih      lr, [r0], #2

.Lfinish:
        pop         {pc}
        .fnend
111