/* memcmp.S revision bd192b470b69e00e9313680b70c5572a609e535d */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 *
 * C equivalent:  int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:       r0 = s1, r1 = s2, r2 = n (byte count)
 * Out:      r0 = 0 if the first n bytes are equal (or n == 0, or s1 == s2);
 *           otherwise the difference of the first differing pair of bytes,
 *           computed as unsigned bytes (*s1 - *s2) -- see the subs at
 *           labels 1 and 8 below.
 * Clobbers: r3, ip, lr and flags (r4 is saved; r5-r7 are saved on the
 *           least-aligned path that uses them).
 *
 * Register roles throughout:
 *   r4 = running s1 pointer (r0 must stay free to hold the result)
 *   r1 = running s2 pointer
 *   r2 = bytes remaining
 *
 * Numeric-label map (ARM local labels, b = backward, f = forward):
 *   0: word-aligned entry / 32-byte congruent main loop
 *   1: tail of congruent loop        2: rewind a differing word
 *   3: 4-bytes-at-a-time tail        4: tail check, then offset-1/3 path
 *   5: non-congruent dispatch        6: unaligned main loops
 *   7: rewind for unaligned paths    8: final byte-by-byte loop
 *   9: restore registers and return
 */

memcmp:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are
         * the same: both trivially compare equal.
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare the 1-3 leading bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip          /* r0 = *s1 - *s2; nonzero => answer */
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same alignment mod 4) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]            /* prime the one-word read-ahead */
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* 8 word comparisons per iteration; loads alternate into ip/lr so
         * each eor compares the word loaded two instructions earlier and
         * never waits on a just-issued load (ARM9 load-use interlock).
         * After the first miscompare every remaining op is skipped via
         * its eq condition.
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs     r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs     r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs     r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs     r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs     r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs     r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs     r0, r0, lr
        bne         2f                  /* some word differed: rewind it */
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them: back both
         * pointers up one word and recompare byte by byte so the
         * return value is the correct byte difference.
         */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall (load-use: ip is consumed by the very next instruction)
        subs        r0, r0, ip          /* r0 = *s1 - *s2; nonzero => answer */
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd       sp!, {r4, lr}
        bx          lr
        .fnend




5:      /*************** non-congruent case ***************/
        /* r0 (from the eor/ands above) = (s1 ^ s2) & 3 = s2's misalignment
         * relative to the now word-aligned s1.
         */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer: r1 is rounded down to a word
         * boundary and lr primed with the first (partial) word.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* 16 bytes per iteration: each step reassembles one s2 word in ip
         * from the top half of the previous word (lr >> 16) and the bottom
         * half of the next (lr << 16), then compares against the s1 word.
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs     r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs     r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs     r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2          /* undo alignment + one-word read-ahead */
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough...
         * r1 went one word + 2 alignment bytes past the differing word;
         * r4 went one word past.  Recompare those 4 bytes bytewise.
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount (bits), r6 = left-shift amount (bits)
        // r7 = scratch: last word fetched from the aligned-down s2

        mov         r5, r0, lsl #3      /* r5 = right shift = offset * 8 */
        rsb         r6, r5, #32         /* r6 = left shift = 32 - r5 */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

        /* 8 bytes per iteration; same word-reassembly idea as the
         * offset-2 loop, but with variable shifts r5/r6.
         */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs     r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r6 >> 3 = 4 - offset = bytes r1 ran ahead due to the read-ahead */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough...
         * rewind r1 past the read-ahead word plus the alignment slack,
         * r4 past the differing word, then recompare 4 bytes bytewise.
         */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b