memcmp.S revision a27d2baa0c1a2ec70f47ea9199b1dd6762c8a349
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 *
 * The 2 major optimizations here are:
 *  (1) The main loops compare multiple words per iteration (32 bytes
 *      per iteration when both pointers are mutually word-aligned,
 *      16 bytes in the halfword-offset case).
 *  (2) The loads are scheduled so that they won't stall: each loop is
 *      software-pipelined, reading one word ahead while the previous
 *      word is being compared (ip and lr alternate as the look-ahead
 *      word in the aligned loop).
 *
 * ABI (AAPCS): r0 = s1, r1 = s2, r2 = n; result returned in r0.
 * On a mismatch, every path rewinds to the differing word and redoes
 * the comparison bytewise (label 8), so the result is always the
 * difference of the first pair of differing unsigned bytes, and 0
 * when the buffers compare equal.
 *
 * Register roles:
 *   r4       running s1 pointer (r0 is reused as scratch/result)
 *   r1       running s2 pointer
 *   r2       bytes remaining
 *   r3, ip, lr  scratch / look-ahead words (r4, lr saved on entry)
 *   r5, r6, r7  shift amounts and carry word, used only by the
 *               byte-offset path (saved/restored around it)
 *
 * NOTE: uses pre-UAL conditional mnemonics (e.g. "eoreqs"; UAL spells
 * this "eorseq") — assemble in pre-UAL/divided syntax mode.
 */

memcmp:
        /* start fetching both buffers early */
        pld     [r0, #0]
        pld     [r1, #0]

        /* take care of the case where length is 0 or the buffers are
         * the same: cmpne only tests r2 when r0 != r1, so either
         * condition yields EQ and an immediate "return 0".
         */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        /* save registers */
        stmfd   sp!, {r4, lr}

        pld     [r0, #32]
        pld     [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     8f                      /* too small: plain bytewise loop */

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f                      /* already word-aligned */

        /* align first pointer: compare r3 leading bytes one at a time */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip              /* r0 = *s1++ - *s2++ */
        bne     9f                      /* mismatch: r0 is the answer */
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same offset modulo 4) */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]                /* prime the look-ahead word */
        subs    r2, r2, #(32 + 4)
        bmi     1f                      /* fewer than 36 left: word loop */

        /* 8 word-compares per iteration; ip and lr alternate as the
         * pre-loaded rhs word, and the whole chain runs conditionally
         * on EQ so it falls through as soon as a word differs.
         */
0:      pld     [r4, #64]
        pld     [r1, #64]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip              /* compare via xor: EQ iff equal */
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f                      /* some word differed */
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)   /* undo read-ahead reserve */
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them bytewise to
         * compute the signed byte difference for the return value
         */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd   sp!, {r4, lr}
        bx      lr




5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f                      /* offset 1 or 3: generic path */

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b                      /* too short: bytewise */

        /* align the unaligned pointer: round r1 down to a word and
         * pre-load; lr carries the straddling halfword between loads
         */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

        /* 16 bytes per iteration: each rhs word is assembled from the
         * top half of the previous word and the bottom half of the next
         */
6:      pld     [r1, #64]
        pld     [r4, #64]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2              /* undo alignment + read-ahead */
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers (r1 is one word ahead plus the 2-byte
         * alignment bias) and fall through to the bytewise redo...
         */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = rhs
        // r6 = lhs
        // r7 = scratch

        mov     r5, r0, lsl #3          /* r5 = right shift (offset*8 bits) */
        rsb     r6, r5, #32             /* r6 = left shift (32 - r5) */

        /* align the unaligned pointer and pre-load; r7 carries the
         * straddling bytes between word loads
         */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8

        /* 8 bytes per iteration; rhs words reassembled with the two
         * complementary shifts
         */
6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3      /* undo alignment bias (bytes) */
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fall through to the bytewise redo... */
        sub     r1, r1, #4              /* r1 read one word ahead */
        sub     r1, r1, r6, lsr #3      /* plus the alignment bias */
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b