/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

#include "asm_support_arm64.S"

/* Parameters and result.  */
#define src1        x0    /* First buffer; advanced as we compare.  */
#define src2        x1    /* Second buffer; advanced as we compare.  */
#define limit       x2    /* Count of 16-bit units in; converted to bytes.  */
#define result      x0    /* Return value aliases src1 (AAPCS64 x0).  */

/* Internal variables.  */
#define data1       x3    /* Current Dword loaded from src1.  */
#define data1w      w3    /* 32-bit view of data1 (half-word loads).  */
#define data2       x4    /* Current Dword loaded from src2.  */
#define data2w      w4    /* 32-bit view of data2.  */
#define has_nul     x5    /* Never referenced in this routine.  */
#define diff        x6    /* XOR of data1/data2: non-zero bits mark differences.  */
#define endloop     x7    /* Loop-exit condition: diff, or ~0 on the last Dword.  */
#define tmp1        x8
#define tmp2        x9
#define tmp3        x10   /* Never referenced in this routine.  */
#define limit_wd    x12   /* Remaining whole Dwords (rounded up).  */
#define mask        x13

// WARNING: If you change this code to use x14 and x15, you must also change
//          art_quick_string_compareto, which relies on these temps being unused.

//-----------------------------------------------------------------------------
// int32_t __memcmp16(const uint16_t* src1, const uint16_t* src2, size_t count)
//
// Compares 'count' 16-bit units.  Returns 0 if every unit is equal, otherwise
// the signed difference src1[i] - src2[i] of the first differing unit i.
//
// In:       x0 = src1, x1 = src2, x2 = count (half-words, NOT bytes)
// Out:      x0 = result
// Clobbers: x1-x4, x6-x9, x12, x13, NZCV.  x14/x15 deliberately untouched
//           (see warning above); x5/x10/x11 unused.
//
// NOTE(review): the aligned fast path loads whole Dwords, so it may read up
// to 7 bytes past the logical end of each buffer — but never past the aligned
// 8-byte granule containing the last element, so it cannot fault.
// Little-endian only (the tail/diff logic assumes early bytes at the LSB end).
//-----------------------------------------------------------------------------
ENTRY __memcmp16
  cbz     limit, .Lret0             // count == 0: buffers trivially equal.
  lsl     limit, limit, #1  /* Half-words to bytes.  */
  eor     tmp1, src1, src2
  tst     tmp1, #7                  // Same offset within a Dword?
  b.ne    .Lmisaligned8             // No: fall back to half-word-at-a-time loop.
  ands    tmp1, src1, #7
  b.ne    .Lmutual_align            // Mutually misaligned: round down and mask.
  add     limit_wd, limit, #7       // limit_wd = ceil(limit / 8) Dwords.
  lsr     limit_wd, limit_wd, #3
  /* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
  ldr     data1, [src1], #8
  ldr     data2, [src2], #8
.Lstart_realigned:
  subs    limit_wd, limit_wd, #1    // Sets Z on the final Dword.
  eor     diff, data1, data2  /* Non-zero if differences found.  */
  csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
  cbz     endloop, .Lloop_aligned   // Stay in the loop only if diff == 0 && !last.
  /* End of performance-critical section  -- one 64B cache line.  */

  /* Not reached the limit, must have found a diff.  */
  cbnz    limit_wd, .Lnot_limit

  /* Limit % 8 == 0 => all bytes significant.  */
  ands    limit, limit, #7
  b.eq    .Lnot_limit

  // Final Dword extends past the logical end: clear the excess bytes in
  // data1/data2 so a difference beyond the limit cannot affect the result.
  // (diff itself is left stale, which is safe: the shift below selects the
  // LOWEST differing half-word, and if that lies past the limit the masked
  // data words compare equal there, yielding 0.)
  lsl     limit, limit, #3  /* Bytes -> bits.  */
  mov     mask, #~0
  lsl     mask, mask, limit         // Ones above the last significant bit.
  bic     data1, data1, mask
  bic     data2, data2, mask

.Lnot_limit:

  // Swap the byte order of diff. Exact reverse is not important, as we only need to detect
  // the half-word.
  rev     diff, diff
  // The most significant bit of DIFF marks the least significant bit of change between DATA1/2
  clz     diff, diff
  // Mask off 0xF to have shift amount: round the bit index down to a multiple
  // of 16, i.e. the start bit of the first differing half-word.
  // Why does ARM64 not have BIC with immediate?!?!
  bfi     diff, xzr, #0, #4
  // Create a 16b mask
  mov     mask, #0xFFFF
  // Shift to the right half-word.  (If diff was 0, clz gave 64; lsr-by-
  // register shifts mod 64, i.e. by 0, and data1 == data2 gives result 0.)
  lsr     data1, data1, diff
  lsr     data2, data2, diff
  // Mask the lowest half-word.
  and     data1, data1, mask
  and     data2, data2, mask
  // Compute difference.
  sub     result, data1, data2
  ret

.Lmutual_align:
  /* Sources are mutually aligned, but are not currently at an
     alignment boundary.  Round down the addresses and then mask off
     the bytes that precede the start point.  */
  bic     src1, src1, #7            // tmp1 still holds src1 & 7 (bytes skipped).
  bic     src2, src2, #7
  add     limit, limit, tmp1  /* Adjust the limit for the extra.  */
  lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
  ldr     data1, [src1], #8
  neg     tmp1, tmp1    /* Bits to alignment -64.  */
  ldr     data2, [src2], #8
  mov     tmp2, #~0
  /* Little-endian.  Early bytes are at LSB.  */
  lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63): ones in the pre-start bytes.  */
  add     limit_wd, limit, #7
  orr     data1, data1, tmp2        // Force the skipped leading bytes identical
  orr     data2, data2, tmp2        // in both words so they cannot produce a diff.
  lsr     limit_wd, limit_wd, #3
  b       .Lstart_realigned

.Lret0:
  mov     result, #0
  ret

  .p2align 6
.Lmisaligned8:
  // Buffers differ in alignment mod 8: compare one half-word per iteration.
  // The -1 bias makes the final 'subs' borrow (C clear) exactly when the last
  // half-word has been consumed; 'ccmp ... cs' then substitutes NZCV = 0
  // (Z clear), forcing loop exit with the last pair still in data1/data2.
  sub     limit, limit, #1
1:
  /* Perhaps we can do better than this.  */
  ldrh    data1w, [src1], #2        // ldrh zero-extends, so x3/x4 stay clean.
  ldrh    data2w, [src2], #2
  subs    limit, limit, #2
  ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
  b.eq    1b
  sub     result, data1, data2      // 0 when we fell out at the limit with equality.
  ret
END __memcmp16

#endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_