/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

        /*
         * Optimized memset() for ARM.
         *
         * memset() returns its first argument.
         */

        .fpu        neon
        .syntax     unified

/*
 * __memset_chk: bounds-checked entry point in front of memset.
 *
 * In:    r0, r1, r2 = passed through untouched to memset
 *        r3         = limit that r2 is checked against
 *                     (presumably the destination object size -- confirm
 *                     against the C prototype)
 * Flow:  r2 <= r3 (unsigned) -> tail-branch to memset, which returns to
 *                               our caller.
 *        r2 >  r3            -> save lr and call __memset_chk_fail.
 *                               Nothing follows the call, so the handler
 *                               is assumed not to return.
 */
ENTRY(__memset_chk)
        cmp         r2, r3
        bls         memset

        // Preserve lr for backtrace.
        push        {lr}
        .cfi_def_cfa_offset 4
        .cfi_rel_offset lr, 0

        bl          __memset_chk_fail
END(__memset_chk)

/*
 * memset: fill r2 bytes starting at r0 with the low byte of r1.
 *
 * In:    r0 = destination
 *        r1 = fill value (only the low 8 bits are stored)
 *        r2 = byte count, treated as unsigned
 * Out:   r0 = destination (never modified)
 * Clobb: r2, r3 (write cursor), ip, q0-q3 (d0-d7), condition flags
 * Stack: not used
 *
 * Shape: counts below 16 take a short path that makes no alignment
 * assumption.  Otherwise the cursor is first advanced to an 8-byte
 * boundary, 64 bytes are stored per main-loop iteration, and the
 * remaining 0..63 bytes are stored by testing the bits of the count
 * from bit 5 down to bit 0.
 */
ENTRY(memset)
        mov         r3, r0
        // At this point only d0, d1 are going to be used below.
        vdup.8      q0, r1
        cmp         r2, #16
        blo         .L_set_less_than_16_unknown_align

.L_check_alignment:
        // Align destination to a double word to avoid the store crossing
        // a cache line boundary.
        ands        ip, r3, #7
        bne         .L_do_double_word_align

.L_double_word_aligned:
        // Duplicate since the less than 64 can use d2, d3.
        vmov        q1, q0
        subs        r2, #64
        blo         .L_set_less_than_64

        // Duplicate the copy value so that we can store 64 bytes at a time.
        vmov        q2, q0
        vmov        q3, q0

1:
        // Main loop stores 64 bytes at a time.
        // On entry r2 = (bytes still to store) - 64.  The store sits
        // between the subtract and the branch and leaves the flags alone.
        // The branch must be unsigned (hs = no borrow): r2 is a byte
        // count, and a signed test would end the loop after one chunk
        // for counts of 2 GiB + 64 or more.
        subs        r2, #64
        vstmia      r3!, {d0 - d7}
        bhs         1b

.L_set_less_than_64:
        // Restore r2 to the count of bytes left to set.
        add         r2, #64
        // C = bit 5 of r2 (32 bytes pending), N = bit 4 (16 bytes pending).
        lsls        ip, r2, #27
        bcc         .L_set_less_than_32
        // Set 32 bytes.
        vstmia      r3!, {d0 - d3}

.L_set_less_than_32:
        bpl         .L_set_less_than_16
        // Set 16 bytes.
        vstmia      r3!, {d0, d1}

.L_set_less_than_16:
        // Less than 16 bytes to set.
        // C = bit 3 of r2 (8 bytes pending), N = bit 2 (4 bytes pending).
        lsls        ip, r2, #29
        bcc         .L_set_less_than_8

        // Set 8 bytes.
        vstmia      r3!, {d0}

.L_set_less_than_8:
        bpl         .L_set_less_than_4
        // Set 4 bytes
        vst1.32     {d0[0]}, [r3]!

.L_set_less_than_4:
        // Result is non-zero iff bit 0 of r2 is set (1 byte pending);
        // C = bit 1 (2 bytes pending).
        lsls        ip, r2, #31
        it          ne
        strbne      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3]
        bx          lr

.L_do_double_word_align:
        // ip = number of bytes (1..7) needed to reach the next 8-byte
        // boundary.  r2 >= 16 here, so the subtraction cannot wrap.
        rsb         ip, ip, #8
        sub         r2, r2, ip

        // Do this comparison now, otherwise we'll need to save a
        // register to the stack since we've used all available
        // registers.
        cmp         ip, #4
        blo         1f

        // Need to do a four byte copy.
        // N = bit 0 of ip (1 byte), C = bit 1 of ip (2 bytes).
        movs        ip, ip, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3], #1
        vst1.32     {d0[0]}, [r3]!
        b           .L_double_word_aligned

1:
        // No four byte copy.
        movs        ip, ip, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3], #1
        b           .L_double_word_aligned

.L_set_less_than_16_unknown_align:
        // Set up to 15 bytes.
        // C = bit 3 of r2 (8 bytes pending), N = bit 2 (4 bytes pending).
        movs        ip, r2, lsl #29
        bcc         1f
        vst1.8      {d0}, [r3]!
1:
        // The shift above does not write V, which is still clear from the
        // "cmp r2, #16" at entry (r2 < 16), so "ge" here means "N clear".
        bge         2f
        vst1.32     {d0[0]}, [r3]!
2:
        // N = bit 0 of r2 (1 byte pending), C = bit 1 (2 bytes pending).
        movs        ip, r2, lsl #31
        it          mi
        strbmi      r1, [r3], #1
        itt         cs
        strbcs      r1, [r3], #1
        strbcs      r1, [r3], #1
        bx          lr
END(memset)