1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *  * Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *  * Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in
12 *    the documentation and/or other materials provided with the
13 *    distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <machine/cpu-features.h>
30#include <machine/asm.h>
31
/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time (8 words per iteration)
 * (2) The loads are scheduled in a way they won't stall
 */
40
41ENTRY(memcmp)
42        PLD         (r0, #0)
43        PLD         (r1, #0)
44
45        /* take of the case where length is 0 or the buffers are the same */
46        cmp         r0, r1
47        cmpne       r2, #0
48        moveq       r0, #0
49        bxeq        lr
50
51        .save {r4, lr}
52        /* save registers */
53        stmfd       sp!, {r4, lr}
54
55        PLD         (r0, #32)
56        PLD         (r1, #32)
57
58        /* since r0 hold the result, move the first source
59         * pointer somewhere else
60         */
61
62         mov        r4, r0
63
64         /* make sure we have at least 8+4 bytes, this simplify things below
65          * and avoid some overhead for small blocks
66          */
67         cmp        r2, #(8+4)
68         bmi        8f
69
70        /* align first pointer to word boundary
71         * offset = -src & 3
72         */
73        rsb         r3, r4, #0
74        ands        r3, r3, #3
75        beq         0f
76
77        /* align first pointer  */
78        sub         r2, r2, r3
791:      ldrb        r0, [r4], #1
80        ldrb        ip, [r1], #1
81        subs        r0, r0, ip
82        bne         9f
83        subs        r3, r3, #1
84        bne         1b
85
86
870:      /* here the first pointer is aligned, and we have at least 4 bytes
88         * to process.
89         */
90
91        /* see if the pointers are congruent */
92        eor         r0, r4, r1
93        ands        r0, r0, #3
94        bne         5f
95
96        /* congruent case, 32 bytes per iteration
97         * We need to make sure there are at least 32+4 bytes left
98         * because we effectively read ahead one word, and we could
99         * read past the buffer (and segfault) if we're not careful.
100         */
101
102        ldr         ip, [r1]
103        subs        r2, r2, #(32 + 4)
104        bmi         1f
105
1060:      PLD         (r4, #64)
107        PLD         (r1, #64)
108        ldr         r0, [r4], #4
109        ldr         lr, [r1, #4]!
110        eors        r0, r0, ip
111        ldreq       r0, [r4], #4
112        ldreq       ip, [r1, #4]!
113        eoreqs      r0, r0, lr
114        ldreq       r0, [r4], #4
115        ldreq       lr, [r1, #4]!
116        eoreqs      r0, r0, ip
117        ldreq       r0, [r4], #4
118        ldreq       ip, [r1, #4]!
119        eoreqs      r0, r0, lr
120        ldreq       r0, [r4], #4
121        ldreq       lr, [r1, #4]!
122        eoreqs      r0, r0, ip
123        ldreq       r0, [r4], #4
124        ldreq       ip, [r1, #4]!
125        eoreqs      r0, r0, lr
126        ldreq       r0, [r4], #4
127        ldreq       lr, [r1, #4]!
128        eoreqs      r0, r0, ip
129        ldreq       r0, [r4], #4
130        ldreq       ip, [r1, #4]!
131        eoreqs      r0, r0, lr
132        bne         2f
133        subs        r2, r2, #32
134        bhs         0b
135
136        /* do we have at least 4 bytes left? */
1371:      adds        r2, r2, #(32 - 4 + 4)
138        bmi         4f
139
140        /* finish off 4 bytes at a time */
1413:      ldr         r0, [r4], #4
142        ldr         ip, [r1], #4
143        eors        r0, r0, ip
144        bne         2f
145        subs        r2, r2, #4
146        bhs         3b
147
148        /* are we done? */
1494:      adds        r2, r2, #4
150        moveq       r0, #0
151        beq         9f
152
153        /* finish off the remaining bytes */
154        b           8f
155
1562:      /* the last 4 bytes are different, restart them */
157        sub         r4, r4, #4
158        sub         r1, r1, #4
159        mov         r2, #4
160
161        /* process the last few bytes */
1628:      ldrb        r0, [r4], #1
163        ldrb        ip, [r1], #1
164        // stall
165        subs        r0, r0, ip
166        bne         9f
167        subs        r2, r2, #1
168        bne         8b
169
1709:      /* restore registers and return */
171        ldmfd       sp!, {r4, lr}
172        bx          lr
173END(memcmp)
174
175
176
177
178
1795:      /*************** non-congruent case ***************/
180        and         r0, r1, #3
181        cmp         r0, #2
182        bne         4f
183
184        /* here, offset is 2 (16-bits aligned, special cased) */
185
186        /* make sure we have at least 16 bytes to process */
187        subs        r2, r2, #16
188        addmi       r2, r2, #16
189        bmi         8b
190
191        /* align the unaligned pointer */
192        bic         r1, r1, #3
193        ldr         lr, [r1], #4
194
1956:      PLD         (r1, #64)
196        PLD         (r4, #64)
197        mov         ip, lr, lsr #16
198        ldr         lr, [r1], #4
199        ldr         r0, [r4], #4
200        orr         ip, ip, lr, lsl #16
201        eors        r0, r0, ip
202        moveq       ip, lr, lsr #16
203        ldreq       lr, [r1], #4
204        ldreq       r0, [r4], #4
205        orreq       ip, ip, lr, lsl #16
206        eoreqs      r0, r0, ip
207        moveq       ip, lr, lsr #16
208        ldreq       lr, [r1], #4
209        ldreq       r0, [r4], #4
210        orreq       ip, ip, lr, lsl #16
211        eoreqs      r0, r0, ip
212        moveq       ip, lr, lsr #16
213        ldreq       lr, [r1], #4
214        ldreq       r0, [r4], #4
215        orreq       ip, ip, lr, lsl #16
216        eoreqs      r0, r0, ip
217        bne         7f
218        subs        r2, r2, #16
219        bhs         6b
220        sub         r1, r1, #2
221        /* are we done? */
222        adds        r2, r2, #16
223        moveq       r0, #0
224        beq         9b
225        /* finish off the remaining bytes */
226        b           8b
227
2287:      /* fix up the 2 pointers and fallthrough... */
229        sub         r1, r1, #(4+2)
230        sub         r4, r4, #4
231        mov         r2, #4
232        b           8b
233
234
2354:      /*************** offset is 1 or 3 (less optimized) ***************/
236
237		stmfd		sp!, {r5, r6, r7}
238
239        // r5 = rhs
240        // r6 = lhs
241        // r7 = scratch
242
243        mov         r5, r0, lsl #3		/* r5 = right shift */
244        rsb         r6, r5, #32         /* r6 = left shift */
245
246        /* align the unaligned pointer */
247        bic         r1, r1, #3
248        ldr         r7, [r1], #4
249        sub         r2, r2, #8
250
2516:      mov         ip, r7, lsr r5
252        ldr         r7, [r1], #4
253        ldr         r0, [r4], #4
254        orr         ip, ip, r7, lsl r6
255        eors        r0, r0, ip
256        moveq       ip, r7, lsr r5
257        ldreq       r7, [r1], #4
258        ldreq       r0, [r4], #4
259        orreq       ip, ip, r7, lsl r6
260        eoreqs      r0, r0, ip
261        bne         7f
262        subs        r2, r2, #8
263        bhs         6b
264
265        sub         r1, r1, r6, lsr #3
266		ldmfd       sp!, {r5, r6, r7}
267
268        /* are we done? */
269        adds        r2, r2, #8
270        moveq       r0, #0
271        beq         9b
272
273        /* finish off the remaining bytes */
274        b           8b
275
2767:      /* fix up the 2 pointers and fallthrough... */
277        sub         r1, r1, #4
278        sub         r1, r1, r6, lsr #3
279        sub         r4, r4, #4
280        mov         r2, #4
281		ldmfd		sp!, {r5, r6, r7}
282        b           8b
283