/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define DEST PARMS
#define SRC DEST+4
#define LEN SRC+4

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8 /* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)
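
/* With %ebx pushed by ENTRANCE, the saved register and the return
   address occupy 8 bytes of stack, hence PARMS = 8 and the arguments
   sit at
        DEST = 8(%esp), SRC = 12(%esp), LEN = 16(%esp).
   As with the standard memcpy, the destination pointer is returned
   in %eax (see L(return)). */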

        .section .text.sse2,"ax",@progbits
ENTRY (MEMCPY)
        ENTRANCE
        movl    LEN(%esp), %ecx
        movl    SRC(%esp), %eax
        movl    DEST(%esp), %edx

/* Nothing to do when source and destination coincide. */
        cmp     %eax, %edx
        je      L(return)

        cmp     $16, %ecx
        jbe     L(len_0_16_bytes)

        cmp     $SHARED_CACHE_SIZE_HALF, %ecx
        jae     L(large_page)

/* Copy the first and last 16 bytes with unaligned moves; the pairs may
   overlap.  Lengths up to 32 are finished after the first pair, up to
   64 after the second, up to 128 after the third; anything longer
   falls through to the aligned main loop. */
        movdqu  (%eax), %xmm0
        movdqu  -16(%eax, %ecx), %xmm1
        cmpl    $32, %ecx
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, -16(%edx, %ecx)
        jbe     L(return)

        movdqu  16(%eax), %xmm0
        movdqu  -32(%eax, %ecx), %xmm1
        cmpl    $64, %ecx
        movdqu  %xmm0, 16(%edx)
        movdqu  %xmm1, -32(%edx, %ecx)
        jbe     L(return)

        movdqu  32(%eax), %xmm0
        movdqu  48(%eax), %xmm1
        movdqu  -48(%eax, %ecx), %xmm2
        movdqu  -64(%eax, %ecx), %xmm3
        cmpl    $128, %ecx
        movdqu  %xmm0, 32(%edx)
        movdqu  %xmm1, 48(%edx)
        movdqu  %xmm2, -48(%edx, %ecx)
        movdqu  %xmm3, -64(%edx, %ecx)
        jbe     L(return)

/* Now the main loop: we align the address of the destination.
   %ebx becomes the first 64-byte-aligned address past the start of
   the destination; %ecx becomes the 64-byte-aligned end. */
        leal    64(%edx), %ebx
        andl    $-64, %ebx

        addl    %edx, %ecx
        andl    $-64, %ecx

/* %eax now holds src - dst, so (%ebx, %eax) addresses the source byte
   corresponding to the destination byte at (%ebx). */
        subl    %edx, %eax

/* Stop two iterations before the end so that the prefetch issued
   128 bytes ahead never reaches past the data being copied. */
        subl    $64, %ecx
        cmpl    %ebx, %ecx
        je      L(main_loop_just_one_iteration)

        subl    $64, %ecx
        cmpl    %ebx, %ecx
        je      L(main_loop_last_two_iterations)

        .p2align 4
L(main_loop_cache):

        prefetcht0 128(%ebx, %eax)

        movdqu  (%ebx, %eax), %xmm0
        movdqu  16(%ebx, %eax), %xmm1
        movdqu  32(%ebx, %eax), %xmm2
        movdqu  48(%ebx, %eax), %xmm3
        movdqa  %xmm0, (%ebx)
        movdqa  %xmm1, 16(%ebx)
        movdqa  %xmm2, 32(%ebx)
        movdqa  %xmm3, 48(%ebx)
        lea     64(%ebx), %ebx
        cmpl    %ebx, %ecx
        jne     L(main_loop_cache)

L(main_loop_last_two_iterations):
        movdqu  (%ebx, %eax), %xmm0
        movdqu  16(%ebx, %eax), %xmm1
        movdqu  32(%ebx, %eax), %xmm2
        movdqu  48(%ebx, %eax), %xmm3
        movdqu  64(%ebx, %eax), %xmm4
        movdqu  80(%ebx, %eax), %xmm5
        movdqu  96(%ebx, %eax), %xmm6
        movdqu  112(%ebx, %eax), %xmm7
        movdqa  %xmm0, (%ebx)
        movdqa  %xmm1, 16(%ebx)
        movdqa  %xmm2, 32(%ebx)
        movdqa  %xmm3, 48(%ebx)
        movdqa  %xmm4, 64(%ebx)
        movdqa  %xmm5, 80(%ebx)
        movdqa  %xmm6, 96(%ebx)
        movdqa  %xmm7, 112(%ebx)
        jmp     L(return)

L(main_loop_just_one_iteration):
        movdqu  (%ebx, %eax), %xmm0
        movdqu  16(%ebx, %eax), %xmm1
        movdqu  32(%ebx, %eax), %xmm2
        movdqu  48(%ebx, %eax), %xmm3
        movdqa  %xmm0, (%ebx)
        movdqa  %xmm1, 16(%ebx)
        movdqa  %xmm2, 32(%ebx)
        movdqa  %xmm3, 48(%ebx)
        jmp     L(return)
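
/* Copies of SHARED_CACHE_SIZE_HALF bytes or more (the threshold is
   provided by cache.h) are handled below: the first and last 128
   bytes are copied with unaligned SSE2 moves, and the 128-byte-aligned
   middle is written with non-temporal movntdq stores so that a huge
   copy does not evict the working set from the cache. */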
L(large_page):
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3
        movdqu  -64(%eax, %ecx), %xmm4
        movdqu  -48(%eax, %ecx), %xmm5
        movdqu  -32(%eax, %ecx), %xmm6
        movdqu  -16(%eax, %ecx), %xmm7
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqu  %xmm4, -64(%edx, %ecx)
        movdqu  %xmm5, -48(%edx, %ecx)
        movdqu  %xmm6, -32(%edx, %ecx)
        movdqu  %xmm7, -16(%edx, %ecx)

        movdqu  64(%eax), %xmm0
        movdqu  80(%eax), %xmm1
        movdqu  96(%eax), %xmm2
        movdqu  112(%eax), %xmm3
        movdqu  -128(%eax, %ecx), %xmm4
        movdqu  -112(%eax, %ecx), %xmm5
        movdqu  -96(%eax, %ecx), %xmm6
        movdqu  -80(%eax, %ecx), %xmm7
        movdqu  %xmm0, 64(%edx)
        movdqu  %xmm1, 80(%edx)
        movdqu  %xmm2, 96(%edx)
        movdqu  %xmm3, 112(%edx)
        movdqu  %xmm4, -128(%edx, %ecx)
        movdqu  %xmm5, -112(%edx, %ecx)
        movdqu  %xmm6, -96(%edx, %ecx)
        movdqu  %xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores.  We align the address
   of the destination, here to a 128-byte boundary. */
        leal    128(%edx), %ebx
        andl    $-128, %ebx

        addl    %edx, %ecx
        andl    $-128, %ecx

        subl    %edx, %eax

        .p2align 4
L(main_loop_large_page):
        movdqu  (%ebx, %eax), %xmm0
        movdqu  16(%ebx, %eax), %xmm1
        movdqu  32(%ebx, %eax), %xmm2
        movdqu  48(%ebx, %eax), %xmm3
        movdqu  64(%ebx, %eax), %xmm4
        movdqu  80(%ebx, %eax), %xmm5
        movdqu  96(%ebx, %eax), %xmm6
        movdqu  112(%ebx, %eax), %xmm7
        movntdq %xmm0, (%ebx)
        movntdq %xmm1, 16(%ebx)
        movntdq %xmm2, 32(%ebx)
        movntdq %xmm3, 48(%ebx)
        movntdq %xmm4, 64(%ebx)
        movntdq %xmm5, 80(%ebx)
        movntdq %xmm6, 96(%ebx)
        movntdq %xmm7, 112(%ebx)
        lea     128(%ebx), %ebx
        cmpl    %ebx, %ecx
        jne     L(main_loop_large_page)
/* Make the non-temporal stores globally visible before returning. */
        sfence
        jmp     L(return)

L(len_0_16_bytes):
/* 0..16 bytes: bit 3 or 4 of the length means 8..16 bytes, bit 2
   means 4..7 bytes; what remains is 0..3 bytes. */
        testb   $24, %cl
        jne     L(len_9_16_bytes)
        testb   $4, %cl
        .p2align 4,,5
        jne     L(len_5_8_bytes)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(return)
        movzbl  (%eax), %ebx
        testb   $2, %cl
        movb    %bl, (%edx)
        je      L(return)
        movzwl  -2(%eax,%ecx), %ebx
        movw    %bx, -2(%edx,%ecx)
        jmp     L(return)

L(len_9_16_bytes):
/* Two possibly overlapping 8-byte moves cover 8..16 bytes. */
        movq    (%eax), %xmm0
        movq    -8(%eax, %ecx), %xmm1
        movq    %xmm0, (%edx)
        movq    %xmm1, -8(%edx, %ecx)
        jmp     L(return)

L(len_5_8_bytes):
/* Two possibly overlapping 4-byte moves cover 4..7 bytes. */
        movl    (%eax), %ebx
        movl    %ebx, (%edx)
        movl    -4(%eax,%ecx), %ebx
        movl    %ebx, -4(%edx,%ecx)
        jmp     L(return)

L(return):
/* memcpy returns the destination pointer. */
        movl    %edx, %eax
        RETURN

END (MEMCPY)
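
/* For reference, a minimal sketch of driving this routine from C,
   assuming the file is assembled for a 32-bit x86 target with SSE2
   and that MEMCPY keeps its default name "memcpy" (define MEMCPY when
   the file is run through the C preprocessor to rename the symbol):

        #include <assert.h>
        #include <string.h>

        int main(void) {
            char src[300], dst[300];
            memset(src, 'x', sizeof(src));
            // Like the standard memcpy, the routine returns dst.
            char *ret = memcpy(dst, src, sizeof(src));
            assert(ret == dst && dst[299] == 'x');
            return 0;
        }
*/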