memcmp.S revision a27d2baa0c1a2ec70f47ea9199b1dd6762c8a349
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *  * Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *  * Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in
12 *    the documentation and/or other materials provided with the
13 *    distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
    .text

    .global memcmp
    .type memcmp, %function
    .align 4

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time in the aligned
 *     (congruent) case, 16 or 8 at a time in the misaligned cases
 * (2) The loads are scheduled in a way they won't stall
 */
42
/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * ABI:    AAPCS (32-bit ARM).
 * In:     r0 = s1, r1 = s2, r2 = n (byte count)
 * Out:    r0 = 0 if the buffers match, otherwise the difference
 *         (s1[i] - s2[i]) of the first pair of bytes that differ.
 * Saved:  r4 and lr (plus r5-r7 on the offset-1/3 path) are preserved
 *         on the stack; r2, r3, ip (r12) and the flags are clobbered.
 *
 * Register roles in the main paths:
 *   r4     = s1 cursor (r0 is needed for the return value)
 *   r1     = s2 cursor
 *   r2     = bytes remaining (biased inside the unrolled loops)
 *   r0     = word/byte loaded from s1, then the comparison result
 *   ip, lr = alternating read-ahead words from s2 (r7 on the 1/3 path)
 *
 * NOTE(review): the conditional flag-setting mnemonics are pre-UAL
 * "divided" syntax (eoreqs rather than UAL eorseq); this file must be
 * assembled in ARM state with an assembler that accepts that syntax.
 */
memcmp:
        pld         [r0, #0]
        pld         [r1, #0]

        /* take care of the cases where the length is 0 or the two
         * buffers are the same pointer: the result is 0
         */
        cmp         r0, r1
        cmpne       r2, #0                  /* EQ iff s1 == s2 or n == 0 */
        moveq       r0, #0
        bxeq        lr

        /* save registers (r4 is callee-saved under AAPCS) */
        stmfd       sp!, {r4, lr}

        pld         [r0, #32]
        pld         [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

         mov        r4, r0

         /* make sure we have at least 8+4 bytes, this simplifies things
          * below and avoids some overhead for small blocks.
          * NOTE(review): bmi is a signed test, so n >= 2^31 also takes
          * the byte-at-a-time path -- still correct, just slow.
          */
         cmp        r2, #(8+4)
         bmi        8f

        /* align first pointer to a word boundary:
         * offset r3 = -s1 & 3 = number of leading bytes to compare singly
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare the leading 1..3 bytes */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip              /* r0 = byte difference */
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is word-aligned, and we have at least
         * 4 bytes to process.
         */

        /* see if the pointers are congruent (same alignment mod 4) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]                /* prime the read-ahead word (no writeback) */
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* unrolled x8: ip and lr alternate as the s2 word so each load
         * completes before its eor needs it (ARM9 load scheduling).
         * On a mismatch the remaining conditional ops are skipped and
         * we fall through to 2f with r0 != 0.
         */
0:      pld         [r4, #64]
        pld         [r1, #64]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?  Undo the -(32+4) bias but
         * keep -4 so the loop below can exit on carry-clear:
         * r2 = bytes remaining - 4
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done?  Undo the -4 bias; 0..3 bytes may remain */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes compared are different: back both pointers
         * up over that word and redo it byte-by-byte so the signed
         * byte difference can be returned
         */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes (r2 >= 1 on every entry here) */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        // stall
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 = result) */
        ldmfd       sp!, {r4, lr}
        bx          lr




5:      /*************** non-congruent case ***************/
        and         r0, r1, #3              /* r0 = s2 misalignment: 1, 2 or 3 */
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bit aligned, special-cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16             /* fewer than 16: undo and go byte-wise */
        bmi         8b

        /* word-align the unaligned pointer (rounds r1 down by 2) and
         * prime the read-ahead word; each s2 word is then rebuilt from
         * the halves of two adjacent aligned words (lsr/lsl #16)
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* 16 bytes per iteration, ip/lr ping-pong as in the loop above */
6:      pld         [r1, #64]
        pld         [r4, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 ran 2 bytes ahead of the logical s2 position (-2 from the
         * bic rounding, +4 from the one-word read-ahead); resync it for
         * the byte tail
         */
        sub         r1, r1, #2
        /* are we done?  Undo the -16 bias */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* mismatch in the last merged word: fix up the 2 pointers
         * (r1 is 4+2 bytes past the word that differed) and
         * fall through to redo it byte-by-byte...
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        /* r5-r7 are callee-saved under AAPCS, hence the extra stmfd */
        stmfd       sp!, {r5, r6, r7}

        // r5 = right-shift amount for the s2 word merge (offset * 8)
        // r6 = left-shift amount (32 - r5)
        // r7 = read-ahead word from s2

        mov         r5, r0, lsl #3          /* r5 = right shift */
        rsb         r6, r5, #32             /* r6 = left shift */

        /* word-align the unaligned pointer (rounds r1 down by the
         * offset), prime the read-ahead word, and bias the count for
         * the 8-bytes-per-iteration loop below
         */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

        /* each s2 word is rebuilt from two adjacent aligned words with
         * lsr r5 / lsl r6.
         * NOTE(review): like the paths above, this reads ahead one
         * aligned word of s2; the over-read stays inside an aligned
         * word that contains at least one valid byte -- confirm this
         * was the intent.
         */
6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r1 ran (4 - offset) = r6>>3 bytes ahead of the logical s2
         * position (bic rounding + one-word read-ahead); resync it and
         * drop the scratch registers (the byte tail and the return
         * path at 9: do not use them)
         */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done?  Undo the -8 bias */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* mismatch in the last merged word: fix up the 2 pointers,
         * restore the scratch registers, and redo it byte-by-byte
         */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
283