/* memcmp.S revision 40bc7cd4ed9fb848a7b3d934f69669f64ceed707 */
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


/* Step size (in bytes) used by the pld prefetch scheduling below:
 * 32 when the build declares 32-byte cache lines, 64 otherwise.
 */
#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * ABI:  AAPCS (32-bit ARM).
 * In:   r0 = s1, r1 = s2, r2 = n (byte count)
 * Out:  r0 = 0 if the buffers are equal; otherwise the byte difference
 *       (*s1 - *s2) at the first mismatch (all mismatch paths funnel
 *       through a bytewise ldrb/subs re-scan, so the sign matches the
 *       standard memcmp contract).
 * Regs: r4 = s1 cursor once the prologue runs (r0 is needed for the
 *       result), r1 = s2 cursor, r2 = bytes remaining, r3/ip/lr scratch.
 *       r4 (and r5-r7 in the odd-offset path) are saved/restored.
 */
ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp        r2, #(8+4)
        bmi        10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Subtract the values and merge results: q2 ends up non-zero
         * iff any of the 32 byte lanes differed.
         */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count */
        add         r2, r2, #32

        cmp        r2, #(8+4)
        bmi        10f
#endif

        /* save registers (with EHABI/CFI unwind annotations) */
        .save       {r4, lr}
        stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare the 1-3 leading bytes one at a time */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same alignment mod 4) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        /* prime the software pipeline: s2's words alternate between ip
         * and lr, so each eor below compares the word loaded one step
         * earlier while the next load is already in flight.
         */
        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them bytewise so the
         * return value is the proper byte difference */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes: plain bytewise compare (r4/lr were
         * never saved on this path, so r3 stands in for the s1 cursor) */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr

5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer: read whole words from the
         * word-aligned address and reassemble s2's stream by shifting */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        sub         r1, r1, #2          /* undo the read-ahead over-advance */
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right-shift count, r6 = left-shift count, r7 = scratch
         * (the read-ahead word of s2's stream)
         */

        mov         r5, r0, lsl #3      /* r5 = right shift = offset * 8 */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        sub         r1, r1, r6, lsr #3  /* rewind by the unread bytes of r7 */
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)