/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result.  */
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

/* Data registers for the 64-bytes-per-iteration bulk loops: four
 * 16-byte LDP/STP pairs, software-pipelined so each store in the loop
 * body uses data loaded on the previous iteration.  */
#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

/*
 * void *memmove(void *dst, const void *src, size_t count)
 * (or wchar_t *wmemmove(wchar_t *, const wchar_t *, size_t) when
 * WMEMMOVE is defined -- the count is scaled to bytes on entry).
 *
 * ABI:  AAPCS64.  In: x0 = dst, x1 = src, x2 = count (see #defines
 * above).  Out: x0 = dst (x0 is never modified below, so the original
 * destination is returned naturally).  Clobbers x3-x14 and flags only
 * (all caller-saved); uses no stack.
 *
 * Strategy: if dst < src (downwards move) or the ranges don't overlap,
 * a forward copy is safe and we tail-call/duplicate memcpy-style code;
 * otherwise we copy backwards from the end of the buffers.
 */
#if defined(WMEMMOVE)
ENTRY(wmemmove)
	/* Convert the wchar_t count to a byte count (wchar_t is 4 bytes,
	 * per the assumptions above).  */
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap.  */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move.  */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved.  Each bit of count selects one power-of-two-sized
	 * transfer, walking backwards with pre-indexed write-back.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	ands	tmp2, src, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move.  Prime A-D with the top
	 * 64 bytes, then the loop stores the previous iteration's data
	 * while loading the next 64, so loads and stores overlap.  */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the pipeline: store the final 64 bytes already loaded.  */
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC.  */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically.  */

	mov	dst, dstin	/* Preserve DSTIN for return value.  */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block.  */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.  */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	   being moved.  Mirror of .Ltail15up, walking forwards with
	   post-indexed write-back.  */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores.  */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice.  */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail.  */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line.  */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move.  Prime A-D with the first
	 * 64 bytes; the loop then stores the previous iteration's data
	 * while loading the next 64.  Pointers are biased back by 16 so
	 * the write-back forms ([reg, #64]! / [reg, #48]!) land on the
	 * next 64-byte chunk.  */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias.  */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	/* Drain the pipeline, then undo the pre-bias so src/dst point
	 * just past the data moved so far for the tail code.  */
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif