/* memcmp.S revision 4e468ed2eb86a2406e14f1eca82072ee501d05fd */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

    .text

    .global memcmp
    .type memcmp, %function
    .align 4

/*
 * int memcmp(const void *lhs, const void *rhs, size_t n)
 *
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 *
 * The 2 major optimizations here are
 * (1) Word-at-a-time main loops: 32 bytes per pass when both pointers
 *     share the same alignment, 16 bytes per pass when they differ by a
 *     halfword, 8 bytes per pass when they differ by 1 or 3 bytes.
 * (2) The loads are scheduled in a way they won't stall.
 *
 * In:    r0 = lhs, r1 = rhs, r2 = n (byte count)
 * Out:   r0 = 0 when n == 0, when lhs == rhs, or when all n bytes match;
 *             otherwise (lhs byte - rhs byte) at the first mismatch,
 *             both bytes zero-extended.
 * Uses:  r4 = running lhs pointer (r0 is needed for the result)
 *        r1 = running rhs pointer, r2 = bytes left (biased inside loops)
 *        r3, ip, lr = scratch; r5-r7 = shift state in the offset 1/3 path
 * Stack: {r4, lr} are pushed after the early-out; the offset 1/3 path
 *        additionally pushes {r5, r6, r7}. Everything pushed is popped
 *        again before returning.
 *
 * Word mismatches are never decoded in place: both pointers are rewound
 * by one word and the byte loop at 8: recomputes the signed result.
 *
 * PLD is not defined in this file; presumably it is a macro supplied by
 * <machine/cpu-features.h> -- confirm against that header.
 *
 * NOTE(review): the shift/merge sequences in the non-congruent paths
 * (lsr on the older word, lsl on the newer word) match little-endian
 * word loads; confirm before building for a big-endian target.
 */

memcmp:
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are
         * the same: Z ends up set if r0 == r1, or else if r2 == 0
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things
         * below and avoids some overhead for small blocks.
         * r2 != 0 here (checked above), so the byte loop at 8: is safe.
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 leading bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same low two address bits) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         *
         * ip / lr alternate as "current rhs word" and "rhs word read
         * ahead". r1 is pre-incremented, r4 post-incremented, so after a
         * compare both pointers sit one word past the compared word.
         * Inside the loop r2 = bytes left - (32 + 4).
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?
         * r2 is (bytes left - 36) here; afterwards it is (bytes left - 4)
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time (no read-ahead in this loop) */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? r2 becomes the true remaining count (0..3) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; requires r2 >= 1 on entry.
         * r0 = lhs byte - rhs byte, which is 0 if the loop runs out.
         */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr





5:      /*************** non-congruent case ***************/
        /* r4 is word aligned here, so r1 & 3 is the relative offset */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process,
         * otherwise undo the subtraction and use the byte loop
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer; lr = first (partial) rhs word */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* each step: ip = (previous word >> 16) | (next word << 16),
         * compared against one aligned lhs word; 4 words per pass
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 ran one aligned word ahead of a pointer that was rounded
         * down by 2, so stepping back 2 restores the true rhs position
         */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the mismatching word in the
         * byte loop: r1 backs up one word plus the 2-byte lead
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right-shift amount in bits
         * r6 = left-shift amount in bits
         * r7 = most recently loaded rhs word
         */

        mov         r5, r0, lsl #3      /* r5 = right shift = offset * 8 */
        rsb         r6, r5, #32         /* r6 = left shift = 32 - r5 */

        /* align the unaligned pointer; r2 is biased by -8 inside the loop
         * (at least 9 bytes remain here: 12 minus up to 3 alignment bytes)
         */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

        /* each step: ip = (previous word >> r5) | (next word << r6),
         * compared against one aligned lhs word; 2 words per pass
         */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r6 >> 3 = 4 - offset: how far r1 runs ahead of the true rhs
         * position. Must be applied before r6 is restored.
         */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the mismatching word in the
         * byte loop; r5-r7 are restored before leaving this path
         */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b

        .size memcmp, . - memcmp
286