memcmp.S revision 4e468ed2eb86a2406e14f1eca82072ee501d05fd
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 *   (1) the main loop compares 16 bytes at a time
 *   (2) the loads are scheduled in a way they won't stall
 *
 * In:    r0 = s1, r1 = s2, r2 = n (byte count)
 * Out:   r0 = 0 if equal, otherwise (first differing byte of s1) -
 *        (corresponding byte of s2), i.e. sign gives the ordering.
 * Clobb: r3, ip, lr, flags (r4-r7 saved/restored on the stack as needed)
 *
 * Register roles in the body:
 *   r4 = s1 cursor (r0 is freed up to build the return value)
 *   r1 = s2 cursor
 *   r2 = bytes remaining
 *   ip/lr = scheduled read-ahead words from s2 (alternating, so one
 *           load is always in flight while the previous word compares)
 *
 * NOTE(review): PLD is a macro from <machine/cpu-features.h>; presumably
 * it expands to a pld instruction or a no-op depending on the target
 * core — confirm against that header.
 *
 * NOTE(review): this file uses pre-UAL "divided" assembler syntax
 * (e.g. eoreqs = eor, condition eq, set flags); it will not assemble
 * as-is under .syntax unified.
 */

memcmp:
        PLD     (r0, #0)
        PLD     (r1, #0)

        /* take care of the case where length is 0 or the buffers are
         * the same: (s1 == s2 || n == 0) returns 0 immediately.
         */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        /* save registers */
        stmfd   sp!, {r4, lr}

        PLD     (r0, #32)
        PLD     (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     8f                      /* tiny input: byte-at-a-time tail loop */

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer: compare the 1..3 leading bytes one by one */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip              /* r0 = byte difference; nonzero => done */
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same alignment mod 4) */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         * ip/lr alternate as the read-ahead word from s2 so each eors
         * compares the word loaded two instructions earlier.
         */

        ldr     ip, [r1]                /* prime the read-ahead word */
        subs    r2, r2, #(32 + 4)
        bmi     1f

0:      PLD     (r4, #64)
        PLD     (r1, #64)
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!           /* pre-indexed: r1 stays one word behind */
        eors    r0, r0, ip              /* compare word 0; eq flag gates the rest */
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr              /* word 1 */
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip              /* word 2 */
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr              /* word 3 */
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip              /* word 4 */
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr              /* word 5 */
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip              /* word 6 */
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr              /* word 7 */
        bne     2f                      /* some word differed: rewind and redo bytewise */
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)   /* undo the read-ahead bias */
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them bytewise to
         * build the signed byte-difference return value
         */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        /* stall */
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd   sp!, {r4, lr}
        bx      lr




5:      /*************** non-congruent case ***************/
        /* r0 = (s1 ^ s2) & 3 = relative byte offset of s2 within its word */
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased):
         * each s2 word is built from the high half of one aligned word
         * and the low half of the next.
         */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b                      /* fewer than 16: bytewise tail handles it */

        /* align the unaligned pointer (round r1 down and pre-load) */
        bic     r1, r1, #3
        ldr     lr, [r1], #4            /* lr = read-ahead aligned word of s2 */

6:      PLD     (r1, #64)
        PLD     (r4, #64)
        mov     ip, lr, lsr #16         /* ip = high half of previous word ... */
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16     /* ... | low half of next = s2 word */
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2              /* undo alignment+read-ahead: back to true s2 */
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers to the start of the differing word
         * and fall through to the bytewise loop...
         */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/
        /* generic shift-and-merge: 8 bytes per iteration */

        stmfd   sp!, {r5, r6, r7}

        /* r5 = rhs shift, r6 = lhs shift, r7 = scratch (read-ahead word) */

        mov     r5, r0, lsl #3          /* r5 = right shift = offset * 8 bits */
        rsb     r6, r5, #32             /* r6 = left shift = 32 - r5 */

        /* align the unaligned pointer (round r1 down and pre-load) */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8

6:      mov     ip, r7, lsr r5          /* merge two aligned words into one s2 word */
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3      /* undo alignment: r6>>3 = 4 - offset bytes */
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers to the start of the differing word
         * and fall through to the bytewise loop...
         */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b