/* memset.S revision ce46f5576ad0c9aefd842492949c4d2965e23e89 */
/*
 * Copyright (C) 2013 The Android Open Source Project
 * Copyright (c) 2014, NVIDIA CORPORATION.  All rights reserved.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <private/bionic_asm.h>
#include <private/libc_events.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         *
         * Entry points in this file: __memset_chk, bzero and memset.  The
         * first two end up in memset by falling through, so the order of
         * the three definitions matters.
         */

        // Target selection: Cortex-A15 with NEON (the code uses pldw and
        // q-register stores), written in unified ARM/Thumb syntax.
        .cpu        cortex-a15
        .fpu        neon
        .syntax     unified

/*
 * __memset_chk: bounds-checked entry point in front of memset.
 *
 * In:  r0, r1, r2 are passed through to memset untouched.
 *      r3 is the limit r2 is checked against (presumably the size of the
 *      destination buffer, going by the error string -- confirm with caller).
 *
 * If r2 <= r3 (unsigned) control goes to .L_done, which sits at the end of
 * bzero and falls through into memset.  Otherwise __fortify_chk_fail is
 * called with r0 = address of error_string and
 * r1 = BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW.
 */
ENTRY(__memset_chk)
        cmp         r2, r3
        bls         .L_done

        // Preserve lr for backtrace.
        // NOTE(review): a single-register push leaves sp only 4-byte
        // aligned at the call below -- confirm that is acceptable here.
        push        {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        // r0 = error_string - (1b + 8), r1 = event code, both read from the
        // literal words that follow the call.
        ldr         r0, error_message
        ldr         r1, error_code
1:
        // Reading pc here yields (1b + 8) in ARM state, so the sum is the
        // absolute address of error_string without needing an absolute
        // relocation.
        // NOTE(review): the +8 bias is only right for ARM state; in Thumb
        // state pc reads as (1b + 4) -- confirm this file is built as ARM.
        add         r0, pc
        // Nothing follows this call but data words, so the callee must not
        // return.
        bl          __fortify_chk_fail
error_code:
        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
error_message:
        .word       error_string-(1b+8)
END(__memset_chk)

/*
 * bzero: zero-fill front end for memset.
 *
 * In:  r0 = destination, r1 = byte count.
 *
 * Moves the count into r2 and loads 0 into r1, which is the register layout
 * memset expects, then runs off the end of this function into memset.
 * There is deliberately no return instruction; ENTRY(memset) must directly
 * follow END(bzero).
 */
ENTRY(bzero)
        mov         r2, r1
        mov         r1, #0
.L_done:
        // __memset_chk also branches here when its check passes.
        // Fall through to memset...
END(bzero)

/*
 * memset: fill r2 bytes starting at r0 with the low byte of r1.
 *
 * In:    r0 = destination, r1 = fill value (only the low byte is used),
 *        r2 = byte count
 * Out:   r0 = destination (r0 is never written; r3 is the working pointer)
 * Uses:  r1, r2, r3, ip, q0-q3, condition flags
 *
 * Strategy:
 *   count < 16  : scalar stores selected by the bits of the count.
 *   count >= 16 : store 1..15 bytes to reach a 16-byte boundary, then
 *                 128-byte NEON blocks while at least 128 bytes remain,
 *                 then a tail selected by the bits of what is left.
 *
 * The tails use "movs ip, rX, lsl #k" to move two bits of the count into
 * the C and N flags at once (C = bit 32-k, N = bit 31-k) and Z to detect
 * "nothing left below this size".  None of the store instructions modify
 * the flags, so one shift drives several conditional stores/branches.
 */
ENTRY(memset)
        pldw        [r0]
        mov         r3, r0

        // Replicate the low byte of r1 into all four bytes of r1.
        mov         r1, r1, lsl #24
        orr         r1, r1, r1, lsr #8
        orr         r1, r1, r1, lsr #16

        cmp         r2, #16
        blo         .L_less_than_16

        // This section handles regions 16 bytes or larger.
        //
        // Use aligned vst1.8 and vstm when possible.  Register values will be:
        //   ip is scratch
        //   q0 and r1 contain the memset value (q1-q3 are filled in below)
        //   r2 is the number of bytes left to set
        //   r3 is the advancing destination pointer
        vdup.32     q0, r1

        ands        ip, r3, #0xF
        beq         .L_memset_aligned

        // Destination is not 16-byte aligned.  Turn ip into the number of
        // bytes (1..15) needed to reach the next 16-byte boundary.
        pldw        [r0, #64]
        rsb         ip, ip, #16

        // Pre-adjust the byte count to reflect the post-alignment value.
        // r2 >= 16 > ip on this path, so the result cannot wrap.
        sub         r2, r2, ip

        /* bit 0 of ip: set 1 byte */
        tst         ip, #1
        it          ne
        strbne      r1, [r3], #1
        /* bit 1 of ip: set 2 bytes */
        tst         ip, #2
        it          ne
        strhne      r1, [r3], #2
        /* N = bit 2 of ip, C = bit 3 of ip */
        movs        ip, ip, lsl #29
        /* N set: set 4 bytes */
        it          mi
        strmi       r1, [r3], #4
        /* C set: set 8 bytes */
        itt         cs
        strcs       r1, [r3], #4
        strcs       r1, [r3], #4

.L_memset_aligned:
        // Destination is now 16-byte aligned.  Determine how to handle
        // remaining bytes (r2 may be less than 16 at this point).
        vmov        q1, q0
        cmp         r2, #128
        blo         .L_less_than_128

        // We need to set a larger block of memory.  Each vstm below writes
        // q0-q3, i.e. 64 bytes, so one loop iteration sets 128 bytes.
        // Pre-decrement r2 to simplify end-of-loop detection: the loop
        // continues while the biased count has not gone below zero.
        vmov        q2, q0
        vmov        q3, q0
        pldw        [r0, #128]
        sub         r2, r2, #128
        // 16-byte align the loop head (.align n is 2^n bytes for ARM gas).
        .align 4
.L_memset_loop_128:
        pldw        [r3, #192]
        vstm        r3!, {q0, q1, q2, q3}
        vstm        r3!, {q0, q1, q2, q3}
        subs        r2, r2, #128
        bhs         .L_memset_loop_128

        // Un-bias r2 so it contains the number of bytes left.  Early
        // exit if we are done.
        adds        r2, r2, #128
        beq         2f

        .align 4
.L_less_than_128:
        // r2 is 0..127 and r3 is 16-byte aligned.
        // C = bit 6 of r2 (64), N = bit 5 (32), Z = bits 0-5 all clear.
        movs        ip, r2, lsl #26
        bcc         1f
        // set 64 bytes
        vst1.8      {q0, q1}, [r3, :128]!
        vst1.8      {q0, q1}, [r3, :128]!
        beq         2f
1:
        // set 32 bytes
        bpl         1f
        vst1.8      {q0, q1}, [r3, :128]!
1:
        // C = bit 4 of r2 (16), N = bit 3 (8), Z = bits 0-3 all clear.
        movs        ip, r2, lsl #28
        bcc         1f
        // set 16 bytes
        vst1.8      {q0}, [r3, :128]!
        beq         2f
1:
        // set 8 bytes
        bpl         1f
        vst1.8      {d0}, [r3, :64]!
1:
        // set 4 bytes
        tst         r2, #4
        it          ne
        strne       r1, [r3], #4
        // C = bit 1 of r2 (2), N = bit 0 (1).
        movs        ip, r2, lsl #31
        // set 2 bytes
        it          cs
        strhcs      r1, [r3], #2
        // set 1 byte
        it          mi
        strbmi      r1, [r3]
2:
        bx          lr

.L_less_than_16:
        // Store up to 15 bytes without worrying about byte alignment.
        // NOTE(review): the word stores below may hit an address that is not
        // 4-byte aligned, so this presumably relies on unaligned word
        // stores being permitted on the target -- confirm.
        // C = bit 3 of r2 (8), N = bit 2 (4), Z = bits 0-2 all clear.
        movs        ip, r2, lsl #29
        bcc         1f
        // set 8 bytes
        str         r1, [r3], #4
        str         r1, [r3], #4
        beq         2f
1:
        // set 4 bytes
        it          mi
        strmi       r1, [r3], #4
        // N = bit 0 of r2 (1), C = bit 1 (2).
        movs        ip, r2, lsl #31
        // set 1 byte
        it          mi
        strbmi      r1, [r3], #1
        // set 2 bytes
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]
2:
        bx          lr
END(memset)

2065b5d6e7045dece4e112553e9a2516240ea32f812Shu Zhang        .data
2075b5d6e7045dece4e112553e9a2516240ea32f812Shu Zhangerror_string:
2085b5d6e7045dece4e112553e9a2516240ea32f812Shu Zhang        .string     "memset: prevented write past end of buffer"
209