/* memcmp.S revision bd192b470b69e00e9313680b70c5572a609e535d */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

    .text

    .global memcmp
    .type memcmp, %function
    .align 4

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * Optimized memcmp() for ARM9 (ARM mode, pre-UAL GNU as syntax).
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loops compare several words per iteration: 32 bytes when
 *     both pointers have the same alignment within a word, 16 bytes when
 *     they differ by 2, and 8 bytes when they differ by 1 or 3
 * (2) The loads are scheduled in a way they won't stall
 *
 * In:    r0 = s1, r1 = s2, r2 = n (byte count)
 * Out:   r0 = 0 if s1 == s2, n == 0, or all n bytes match; otherwise
 *        (byte from s1) - (byte from s2) at the first mismatching offset
 * Uses:  r4 = running s1 pointer, r1 = running s2 pointer,
 *        r2 = remaining byte count (biased inside the word loops),
 *        r3, ip, lr = scratch; r5-r7 only in the offset 1/3 path.
 *        r4, lr and r5-r7 are pushed on the stack and restored before
 *        returning; r0-r3, ip and the flags are clobbered.
 *
 * Every word loop only detects *that* a word differs; it then rewinds both
 * pointers by one word and lets the byte loop at label 8 find the exact
 * byte and produce the return value.
 *
 * NOTE(review): the unaligned paths rebuild a word using "lsr" on the
 * earlier aligned word and "lsl" on the later one, which matches
 * little-endian byte order - confirm big-endian is not a target.
 * NOTE(review): those paths round r1 down to a word boundary, so they load
 * whole aligned words that may include bytes just outside [s2, s2 + n).
 * NOTE(review): only the {r4, lr} push is described with .save; the
 * temporary r5-r7 push in the offset 1/3 path is not visible to the
 * unwinder.
 */

memcmp:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* compare r3 (1..3) leading bytes so that r4 becomes word aligned */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 9 bytes
         * to process (12 minus at most 3 alignment bytes).
         */

        /* see if the pointers are congruent (same offset within a word) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]                /* read-ahead word from r1 */
        subs        r2, r2, #(32 + 4)       /* r2 = remaining - 36 */
        bmi         1f

        /* ip and lr alternate as the read-ahead register; every step is
         * predicated on the previous words having compared equal.
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?
         * (undo the 36-byte bias, leaving r2 = remaining - 4)
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? (r2 becomes the true remaining count, 0..3) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; r2 must be non-zero on entry */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/
        /* r4 is word aligned here, so r1 & 3 is 1, 2 or 3 */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process,
         * otherwise restore r2 and use the byte loop
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer and preload its first word */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* each step merges the top half of the previous word with the
         * bottom half of the next one and compares it with a word from r4
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 is one read-ahead word past the rounded-down position;
         * step back 2 to get the real (unaligned) byte position
         */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* a word differed: rewind both pointers to the start of that
         * word and let the byte loop locate the difference
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right shift amount in bits (8 * offset)
         * r6 = left shift amount in bits (32 - r5)
         * r7 = most recently loaded aligned word from r1
         */

        mov         r5, r0, lsl #3          /* r5 = right shift */
        rsb         r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer and preload its first word */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8              /* r2 = remaining - 8 */

        /* 8 bytes per iteration */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r6 >> 3 == 4 - offset: undo the read-ahead so that r1 is the
         * real (unaligned) byte position again
         */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* a word differed: rewind both pointers to the start of that
         * word, restore r5-r7 and let the byte loop locate the difference
         */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b

        /* the unwind region and symbol size cover every path above */
        .fnend
        .size memcmp, .-memcmp
289