/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29#include <machine/cpu-features.h>
30#include <machine/asm.h>
31
/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are:
 * (1) The main loops compare whole words: 16 half-words (32 bytes) per
 *     iteration when both pointers share the same word alignment, and
 *     8 half-words (16 bytes) per iteration otherwise
 * (2) The loads are scheduled in a way they won't stall
 */
40
/*
 * __memcmp16 - compare two buffers made of 16-bit units.
 *
 * In:   r0 = first buffer
 *       r1 = second buffer
 *       r2 = number of half-words (16-bit units) to compare
 * Out:  r0 = 0 when r0 == r1 on entry, when r2 == 0, or when every
 *            half-word matches; otherwise the first differing half-word
 *            of the first buffer minus the corresponding half-word of
 *            the second buffer (both zero-extended by ldrh)
 * Uses: r1, r2, r3, ip and the flags are overwritten. On the large-block
 *       path r4 and lr are pushed and popped; lr doubles as a scratch
 *       register there. r4 is never written by this routine (presumably
 *       it is pushed only to keep the stack 8-byte aligned).
 *
 * ENTRY, END and PLD are macros defined outside this file.
 *
 * NOTE(review): only bit 1 of the pointers is ever tested, so both
 * buffers are assumed to be half-word aligned -- confirm against callers.
 * NOTE(review): the non-congruent path rebuilds words with 16-bit shifts
 * in an order that matches memory layout only on little-endian targets.
 */
ENTRY(__memcmp16)
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where the length is zero or the buffers
         * are the same: both return 0 right away
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words; this simplifies things
         * below and avoids some overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words): one half-word per pass */
        PLD         (r0, #32)
        PLD         (r1, #32)

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bxne        lr              /* mismatch: r0 holds the difference */
        subs        r2, r2, #1
        bne         1b
        bx          lr              /* all equal: r0 is 0 from the last subs r0 */


        .save {r4, lr}
        /* large blocks: lr is used as a data register below, so save it */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary by comparing one half-word */
        tst         r3, #2
        beq         0f

        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* on a mismatch, restore registers and return the difference */
        ldmnefd     sp!, {r4, lr}
        bxne        lr


0:      /* here bit 1 of the first pointer is clear, and at least 11
         * half-words remain to be processed
         */

        /* see if the pointers are congruent, i.e. bit 1 of the second
         * pointer is clear as well
         */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration.
         * ip and lr alternate as a one-word look-ahead of the second
         * buffer: each step loads the next word while comparing the
         * previous one. We need to make sure there are at least 16+2
         * half-words left, because the last load of an iteration reads
         * one word beyond the 16 half-words being compared, and we could
         * read past the buffer (and segfault) if we're not careful.
         * From here on r2 is biased: r2 = remaining - (16 + 2).
         */

        ldr         ip, [r1]
        subs        r2, r2, #(16 + 2)
        bmi         1f

0:
        PLD         (r3, #64)
        PLD         (r1, #64)
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        /* once a word differs the remaining conditional steps are skipped,
         * so r3 and r1 both point just past the differing word
         */
        bne         2f
        subs        r2, r2, #16
        bhs         0b

        /* r2 is (remaining - 18) here; rebias it to (remaining - 2).
         * do we have at least 2 half-words left?
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* r2 is (remaining - 2) here: are we done, or is one half-word left? */
4:      adds        r2, r2, #2
        bne         8f
        /* nothing left: restore registers and return 0 */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last word compared differs and both pointers are already
         * past it: redo it one half-word at a time to get the result
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words; r2 = remaining, non-zero on entry */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return; r0 holds the result */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* align the unaligned pointer: round the second pointer down to a
         * word boundary and read whole words from it. lr carries the most
         * recently loaded word; each word to compare is rebuilt from the
         * upper half of the previous word and the lower half of the next.
         * 8 half-words per iteration, r2 is biased: r2 = remaining - 8.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

6:
        PLD         (r3, #64)
        PLD         (r1, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 runs 2 bytes ahead of the next half-word to compare; undo that */
        sub         r1, r1, #2
        /* remove the bias: r2 = remaining. are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* a word differs: make r1 line up with r3 again, then reuse the
         * half-word recheck of the congruent path
         */
        sub         r1, r1, #2
        b           2b
END(__memcmp16)
233