/* memmove.S, revision 8167dd7cb98e87ffe9b40e4993c330b244ca2234 */
/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result. */
#ifdef BCOPY
#define dstin	x1
#define src	x0
#else
#define dstin	x0
#define src	x1
#endif
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

#ifdef BCOPY
ENTRY(bcopy)
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap. */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move. */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block. */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate. */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved. */
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret
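	/* The tail blocks above avoid a byte-by-byte loop: bits 5:4 of
	 * count select 0-48 bytes of paired copies, then bits 3, 2, 1
	 * and 0 each move one 8-, 4-, 2- or 1-byte chunk.  An
	 * illustrative C sketch of the backward byte tail (hypothetical
	 * helper, not part of the build; needs <string.h>/<stdint.h>):
	 *
	 *	static void tail15_backwards(unsigned char *dst,
	 *	                             const unsigned char *src,
	 *	                             size_t n) {
	 *		uint64_t t8; uint32_t t4; uint16_t t2;
	 *		// dst and src point one byte past the remaining
	 *		// data, exactly like DST and SRC in .Ltail15up.
	 *		if (n & 8) { src -= 8; dst -= 8;
	 *		             memcpy(&t8, src, 8); memcpy(dst, &t8, 8); }
	 *		if (n & 4) { src -= 4; dst -= 4;
	 *		             memcpy(&t4, src, 4); memcpy(dst, &t4, 4); }
	 *		if (n & 2) { src -= 2; dst -= 2;
	 *		             memcpy(&t2, src, 2); memcpy(dst, &t2, 2); }
	 *		if (n & 1) dst[-1] = src[-1];
	 *	}
	 *
	 * Each chunk is staged through a temporary (like tmp1 above), so
	 * an overlap within a chunk cannot corrupt the data. */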
.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores. */
	ands	tmp2, src, #15		/* Bytes to reach alignment. */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice. */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now. */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail. */
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line. */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC. */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically. */

	mov	dst, dstin	/* Preserve DSTIN for return value. */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block. */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate. */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved. */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret
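	/* Both 64-byte main loops (.Lmov_body_large_up above, and
	 * .Lmov_body_large_down below) are software-pipelined: 64 bytes
	 * are loaded into A_l..D_h before the loop, and each iteration
	 * stores the block loaded on the previous iteration while
	 * refilling the same registers, so loads stay one block ahead
	 * of stores.  A C-level sketch of the forward variant under the
	 * same preconditions (count >= 128, src 16-byte aligned;
	 * hypothetical helper, not part of the build):
	 *
	 *	static void big_forward(unsigned char *dst,
	 *	                        const unsigned char *src,
	 *	                        ptrdiff_t count) {
	 *		unsigned char cur[64];	// stands in for A_l..D_h
	 *		memcpy(cur, src, 64);	// prologue: load block 0
	 *		src += 64;
	 *		count -= 128;
	 *		do {
	 *			memcpy(dst, cur, 64);	// store block i...
	 *			memcpy(cur, src, 64);	// ...then load block i+1
	 *			src += 64; dst += 64;
	 *			count -= 64;
	 *		} while (count >= 0);
	 *		memcpy(dst, cur, 64);	// epilogue: store last block
	 *		// count & 0x3f bytes remain for the .Ltail63* code.
	 *	}
	 */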
.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores. */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice. */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now. */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail. */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line. */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move. */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias. */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias. */
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif
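/* This file is assembled three times: with BCOPY defined it yields
 * bcopy, with WMEMMOVE defined it yields wmemmove, and otherwise plain
 * memmove.  The only differences are the #ifdef blocks above.  In C
 * terms (a summary of the mapping, not code from this file):
 *
 *	// bcopy takes its operands in the opposite order, hence the
 *	// swapped x0/x1 register #defines at the top of the file.
 *	void bcopy(const void *src, void *dst, size_t n);
 *		// behaves as memmove(dst, src, n), result discarded
 *
 *	// wmemmove counts in wchar_t units; "lsl count, count, #2"
 *	// rescales to bytes, relying on the 4-byte wchar_t assumption.
 *	wchar_t *wmemmove(wchar_t *dst, const wchar_t *src, size_t n);
 *		// behaves as memmove(dst, src, n * sizeof(wchar_t))
 */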