memcmp.S revision 40bc7cd4ed9fb848a7b3d934f69669f64ceed707
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


/* Prefetch (pld) offsets below are multiples of the cache-line size;
 * select the line size that matches the target core. */
#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 *
 * C signature:  int memcmp(const void *lhs, const void *rhs, size_t n);
 * AAPCS in:     r0 = lhs, r1 = rhs, r2 = n (bytes)
 * Out:          r0 = 0 if the buffers are equal, otherwise the difference
 *               of the first pair of differing bytes (lhs byte minus rhs
 *               byte, each zero-extended via ldrb before subs).
 * Clobbers:     r3, ip, flags; r4/lr and r5-r7 are saved/restored around
 *               the paths that use them (see stmfd/ldmfd below).
 *
 * Strategy, in order:
 *   - trivial cases (lhs == rhs, or fewer than 12 bytes) -> bytewise;
 *   - optional NEON pass comparing 32 bytes at a time;
 *   - word-align lhs, then pick one of three word loops depending on the
 *     relative alignment of rhs: congruent (same offset), 2-byte offset
 *     (halfword-aligned, special cased), or 1/3-byte offset (generic
 *     shift-and-merge);
 *   - any word-level mismatch rewinds and re-runs the bytes one at a time
 *     so the return value is always the bytewise difference.
 */

ENTRY(memcmp)
        /* warm up the caches for lhs as early as possible */
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f                  /* fewer than 32 bytes: skip NEON */

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results: after the two
         * vsub's, q0/q1 are zero exactly where the inputs matched; the
         * vorr's funnel all 32 lanes down to one doubleword (d4). */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f                  /* mismatch somewhere in this chunk */
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes,
         * and rewind the pointers to re-scan the offending half with the
         * scalar code below. */
        sub         r0, #32
        vorr        d0, d1              /* collapse first-16-byte result */
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count (undo the bias applied above) */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f                 /* tail too small for the word loops */
#endif

        /* save registers */
        .save {r4, lr}
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f                  /* already word-aligned */

        /* align first pointer: compare 1-3 leading bytes bytewise */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f                  /* r0 already holds the byte diff */
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same offset within a word) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]            /* prime the rhs read-ahead word */
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* Unrolled x8: ip and lr alternate as the "current" rhs word so a
         * new rhs load can issue while the previous word is compared; the
         * whole chain is conditional on EQ, so it stops at the first
         * mismatching word.
         * NOTE(review): r1 uses pre-indexed [r1, #4]! while r4 uses
         * post-indexed loads, which is what makes the uniform -4 rewind at
         * 2: below land on the mismatching word — confirm against a trace. */
0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip          /* 0 iff the words were equal */
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f                  /* some word differed */
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them bytewise so the
         * return value is the difference of the first differing bytes */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 = 0 or byte difference) */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes: plain bytewise compare.
         * r4/lr are NOT saved on this path, so use r3 as the lhs cursor
         * and return directly with bx. */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************
         * lhs (r4) is word-aligned; r0 = rhs offset within a word (1-3). */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased):
         * rhs words are rebuilt from two halves with 16-bit shifts. */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b                  /* too short: finish bytewise */

        /* align the unaligned pointer (over-reads up to 2 bytes of the
         * word containing rhs[0]; compensated by the rewinds below) */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        /* ip = next rhs word = high half of lr | low half of next load;
         * unrolled x4 (16 bytes), conditional on EQ like the main loop */
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2          /* undo the 2-byte over-read */
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough to the bytewise tail:
         * r1 rewinds the read-ahead word plus the 2-byte alignment bias */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount in bits (rhs offset * 8)
        // r6 = left-shift amount in bits (32 - r5)
        // r7 = scratch: last rhs word loaded (shift-and-merge carry)

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer (over-reads the partial first word;
         * compensated by the r6-based rewinds below) */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

        /* 8 bytes per iteration: each rhs word is (r7 >> r5) | (next << r6) */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3  /* rewind the bytes over-read (r6/8) */
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough to the bytewise tail */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)