/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main (congruent) loop compares 32 bytes (8 words) at a time
 * (2) The loads are scheduled in a way they won't stall
 *
 * C equivalent:  int memcmp(const void *s1, const void *s2, size_t n);
 * In:   r0 = s1, r1 = s2, r2 = n
 * Out:  r0 = 0 if equal; otherwise (first differing byte of s1) minus
 *       (corresponding byte of s2), computed via `subs r0, r0, ip` below.
 * Uses: r3 (alignment count), r4 (saved s1 cursor), ip, lr as scratch;
 *       r5-r7 additionally saved/used in the offset-1/3 path.
 *
 * NOTE(review): PLD is a macro from <machine/cpu-features.h> — presumably
 * it expands to a `pld` instruction or to nothing depending on the target
 * CPU; confirm against that header.
 * NOTE(review): `eoreqs` is the pre-UAL spelling (UAL: `eorseq`) — this
 * file targets the legacy GNU as ARM syntax.
 */

ENTRY(memcmp)
        /* start prefetching both buffers */
        PLD     (r0, #0)
        PLD     (r1, #0)

        /* take care of the case where length is 0 or the buffers are
         * the same (identical pointers): result is 0, return immediately.
         */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        .save {r4, lr}          /* unwind annotation for the stmfd below */
        /* save registers */
        stmfd   sp!, {r4, lr}

        PLD     (r0, #32)
        PLD     (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov     r4, r0

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         * (small blocks go straight to the byte loop at 8:)
         */
        cmp     r2, #(8+4)
        bmi     8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer: compare the 0..3 leading bytes one by one.
         * r2 was already reduced by r3 above, so the word loops below see
         * only the remaining length.
         */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip              /* r0 = *s1++ - *s2++; nonzero => done */
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same alignment mod 4) */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]                /* prime the read-ahead word from s2 */
        subs    r2, r2, #(32 + 4)
        bmi     1f

        /* unrolled x8: each step loads the next s1 word into r0 and the
         * *following* s2 word into lr/ip (pre-indexed, one word ahead),
         * then eors r0 against the s2 word loaded the step before.
         * ip and lr alternate as "current" vs "next" s2 word so the
         * compare never waits on the load just issued.  The whole chain
         * is conditional on EQ, so the first mismatch stops all work.
         */
0:      PLD     (r4, #64)
        PLD     (r1, #64)
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f                      /* some word differed: back up and redo bytewise */
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left?
         * (undo the 32+4 bias, keep a -4 bias for the bhs loop below)
         */
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them bytewise to
         * compute the signed byte difference the ABI requires
         */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes (r2 > 0 guaranteed on entry) */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip              /* r0 = byte difference; doubles as result */
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd   sp!, {r4, lr}
        bx      lr
END(memcmp)

        /* NOTE(review): the code below sits after END(memcmp) but is
         * still part of the function — it is reached via `bne 5f` /
         * `bne 4f` above and jumps back into the body via 8b/9b.
         */



5:      /*************** non-congruent case ***************/
        /* r0 currently holds (r4 ^ r1) & 3 from above; recompute the
         * misalignment of s2 itself to pick a strategy.
         */
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer; lr holds the word straddling the
         * 2-byte offset, recombined by lsr #16 / lsl #16 below
         */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

        /* unrolled x4: build each logical s2 word from the high half of
         * the previous aligned word and the low half of the next one,
         * then compare with the aligned s1 word.  EQ-conditional chain
         * as in the congruent loop.
         */
6:      PLD     (r1, #64)
        PLD     (r4, #64)
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2              /* restore s2 to its true (offset-2) position */
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers (undo read-ahead and realignment) and
         * redo the differing word bytewise via 8b
         */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = rhs
        // r6 = lhs
        // r7 = scratch

        mov     r5, r0, lsl #3          /* r5 = right shift = offset * 8 bits */
        rsb     r6, r5, #32             /* r6 = left shift  = 32 - r5 */

        /* align the unaligned pointer; r7 carries the straddling word */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8

        /* unrolled x2: same recombination idea as the offset-2 loop,
         * but with variable shifts r5/r6 instead of the constant 16
         */
6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        /* restore s2 to its true position: r6 lsr #3 = 4 - offset bytes */
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers (undo read-ahead and realignment),
         * restore scratch registers, and redo the word bytewise via 8b
         */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b