1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "go_tls.h"
7#include "funcdata.h"
8#include "textflag.h"
9
// rt0_go is the runtime bootstrap routine. It moves argc/argv onto a
// 16-byte-aligned stack, sets up g0's stack bounds, records CPUID feature
// bits, installs and sanity-checks TLS, links g0 and m0 together, runs the
// runtime initializers, queues runtime·main as a new goroutine and starts
// this M.
TEXT runtime·rt0_go(SB),NOSPLIT,$0
	// Fetch argc/argv before SP is moved.
	MOVL	argc+0(FP), AX
	MOVL	argv+4(FP), BX

	// Reserve 128 bytes of scratch and round SP down to a 16-byte boundary.
	MOVL	SP, CX
	SUBL	$128, CX
	ANDL	$~15, CX
	MOVL	CX, SP

	// Stash argc/argv in the scratch area; they are reloaded further down.
	MOVL	AX, 16(SP)
	MOVL	BX, 24(SP)

	// Build g0's stack out of the operating system stack we were given:
	// stack.lo and both stack guards share one address below SP,
	// stack.hi is the current SP.
	MOVL	$runtime·g0(SB), DI
	LEAL	(-64*1024+104)(SP), BX
	MOVL	BX, (g_stack+stack_lo)(DI)
	MOVL	BX, g_stackguard0(DI)
	MOVL	BX, g_stackguard1(DI)
	MOVL	SP, (g_stack+stack_hi)(DI)

	// Probe the processor. If CPUID with AX=0 reports 0, skip the feature
	// query; otherwise record CX/DX from CPUID with AX=1.
	MOVQ	$0, AX
	CPUID
	CMPQ	AX, $0
	JE	cpuid_done
	MOVQ	$1, AX
	CPUID
	MOVL	CX, runtime·cpuid_ecx(SB)
	MOVL	DX, runtime·cpuid_edx(SB)
cpuid_done:

	// Point thread-local storage at m0.tls.
	LEAL	runtime·m0+m_tls(SB), DI
	CALL	runtime·settls(SB)

	// Verify TLS works: write a marker through it and read it back
	// directly from m0.tls.
	get_tls(BX)
	MOVQ	$0x123, g(BX)
	MOVQ	runtime·m0+m_tls(SB), AX
	CMPQ	AX, $0x123
	JEQ	tls_ok
	MOVL	AX, 0	// abort
tls_ok:
	// Install the per-goroutine and per-machine "registers": g = g0.
	get_tls(BX)
	LEAL	runtime·g0(SB), CX
	MOVL	CX, g(BX)
	LEAL	runtime·m0(SB), AX

	// m0.g0 = g0
	MOVL	CX, m_g0(AX)
	// g0.m = m0
	MOVL	AX, g_m(CX)

	CLD				// by convention the direction flag stays cleared
	CALL	runtime·check(SB)

	// Pass the saved argc/argv to runtime·args.
	MOVL	16(SP), AX		// argc
	MOVL	AX, 0(SP)
	MOVL	24(SP), AX		// argv
	MOVL	AX, 4(SP)
	CALL	runtime·args(SB)
	CALL	runtime·osinit(SB)
	CALL	runtime·schedinit(SB)

	// Create the goroutine that will start the program.
	MOVL	$runtime·mainPC(SB), AX	// entry
	MOVL	$0, 0(SP)
	MOVL	AX, 4(SP)
	CALL	runtime·newproc(SB)

	// Start this M.
	CALL	runtime·mstart(SB)

	// Reaching this point is a failure: crash with a store to a bad address.
	MOVL	$0xf1, 0xf1  // crash
	RET
86
// mainPC is a 4-byte read-only cell holding the address of runtime·main;
// rt0_go passes its address to newproc as the entry for the first goroutine.
DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
GLOBL	runtime·mainPC(SB),RODATA,$4
89
// breakpoint raises software interrupt 3 and then returns to the caller.
TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
	INT	$3
	RET
93
// asminit is a no-op here: there is no per-thread initialization to perform.
TEXT runtime·asminit(SB),NOSPLIT,$0-0
	RET
97
98/*
99 *  go-routine
100 */
101
// void gosave(Gobuf*)
// Record the caller's PC, SP and current g in the Gobuf (setjmp-style).
TEXT runtime·gosave(SB), NOSPLIT, $0-4
	MOVL	buf+0(FP), AX		// AX = gobuf
	MOVL	0(SP), BX		// BX = caller's PC
	MOVL	BX, gobuf_pc(AX)
	LEAL	buf+0(FP), BX		// BX = caller's SP
	MOVL	BX, gobuf_sp(AX)
	MOVQ	$0, gobuf_ret(AX)
	// ctxt is required to be zero already (see func save); report it if not.
	MOVL	gobuf_ctxt(AX), BX
	TESTL	BX, BX
	JZ	ctxt_ok
	CALL	runtime·badctxt(SB)
ctxt_ok:
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	BX, gobuf_g(AX)
	RET
120
// void gogo(Gobuf*)
// Resume execution from the state stored in a Gobuf (longjmp-style).
TEXT runtime·gogo(SB), NOSPLIT, $8-4
	MOVL	buf+0(FP), BX		// BX = gobuf

	// A non-nil ctxt is about to be overwritten, so run the deletion
	// barrier on it first.
	MOVL	gobuf_ctxt(BX), DX
	TESTL	DX, DX
	JZ	resume
	LEAL	gobuf_ctxt(BX), AX
	MOVL	AX, 0(SP)
	MOVL	$0, 4(SP)
	CALL	runtime·writebarrierptr_prewrite(SB)
	MOVL	buf+0(FP), BX		// reload gobuf after the call

resume:
	MOVL	gobuf_g(BX), DX
	MOVL	0(DX), CX		// fault here if g == nil
	get_tls(CX)
	MOVL	DX, g(CX)		// g = gobuf.g
	MOVL	gobuf_sp(BX), SP	// switch to the saved stack
	MOVL	gobuf_ctxt(BX), DX
	MOVQ	gobuf_ret(BX), AX
	// Zero the consumed fields to help the garbage collector.
	MOVL	$0, gobuf_sp(BX)
	MOVQ	$0, gobuf_ret(BX)
	MOVL	$0, gobuf_ctxt(BX)
	MOVL	gobuf_pc(BX), BX
	JMP	BX
149
// func mcall(fn func(*g))
// Save the current goroutine's state in g->sched, switch to m->g0's
// stack and call fn(g) there.
// fn must never return; it should gogo(&g->sched) to keep running g.
TEXT runtime·mcall(SB), NOSPLIT, $0-4
	MOVL	fn+0(FP), DI		// DI = fn

	// Save the caller's PC, SP and g into g->sched.
	get_tls(CX)
	MOVL	g(CX), AX		// AX = g
	MOVL	0(SP), BX		// caller's PC
	MOVL	BX, (g_sched+gobuf_pc)(AX)
	LEAL	fn+0(FP), BX		// caller's SP
	MOVL	BX, (g_sched+gobuf_sp)(AX)
	MOVL	AX, (g_sched+gobuf_g)(AX)

	// Look up m->g0; being on g0 already is an error (badmcall).
	MOVL	g(CX), BX
	MOVL	g_m(BX), BX
	MOVL	m_g0(BX), SI
	CMPL	SI, AX
	JNE	to_g0
	MOVL	$runtime·badmcall(SB), AX
	JMP	AX
to_g0:
	MOVL	SI, g(CX)			// g = m->g0
	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
	PUSHQ	AX				// argument for fn: the g we left
	MOVL	DI, DX				// DX = fn (closure pointer)
	MOVL	0(DI), DI			// DI = code address stored in fn
	CALL	DI
	// fn returned, which is not allowed: report via badmcall2.
	POPQ	AX
	MOVL	$runtime·badmcall2(SB), AX
	JMP	AX
	RET
183
// systemstack_switch is a placeholder routine whose address systemstack
// records as the saved PC at the bottom of the G stack. It must be distinct
// from the routine at the top of the system stack, because that one
// terminates the stack walk (see topofstack()).
TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
	RET
191
// func systemstack(fn func())
// Run fn on the system (g0) stack. If already running on g0 or gsignal,
// fn is called directly; if running on m->curg, the goroutine state is
// saved, the stack is switched to g0 for the call, and then switched back.
TEXT runtime·systemstack(SB), NOSPLIT, $0-4
	MOVL	fn+0(FP), DI	// DI = fn
	get_tls(CX)
	MOVL	g(CX), AX	// AX = g
	MOVL	g_m(AX), BX	// BX = m

	MOVL	m_gsignal(BX), DX	// DX = gsignal
	CMPL	AX, DX
	JEQ	call_direct

	MOVL	m_g0(BX), DX	// DX = g0
	CMPL	AX, DX
	JEQ	call_direct

	MOVL	m_curg(BX), R8
	CMPL	AX, R8
	JEQ	do_switch

	// Neither g0 nor curg. Must be gsignal, but that's not allowed.
	// The indirect call hides it from the linker's nosplit analysis.
	MOVL	$runtime·badsystemstack(SB), AX
	CALL	AX

do_switch:
	// Save our state in g->sched, pretending to be systemstack_switch
	// in case the G stack is scanned.
	MOVL	$runtime·systemstack_switch(SB), SI
	MOVL	SI, (g_sched+gobuf_pc)(AX)
	MOVL	SP, (g_sched+gobuf_sp)(AX)
	MOVL	AX, (g_sched+gobuf_g)(AX)

	// Switch to g0 and its stack.
	MOVL	DX, g(CX)
	MOVL	(g_sched+gobuf_sp)(DX), SP

	// Call fn: DX carries the closure pointer, DI the code address.
	MOVL	DI, DX
	MOVL	0(DI), DI
	CALL	DI

	// Switch back to m->curg and restore (then clear) its saved SP.
	get_tls(CX)
	MOVL	g(CX), AX
	MOVL	g_m(AX), BX
	MOVL	m_curg(BX), AX
	MOVL	AX, g(CX)
	MOVL	(g_sched+gobuf_sp)(AX), SP
	MOVL	$0, (g_sched+gobuf_sp)(AX)
	RET

call_direct:
	// Already on an m stack: just call fn in place.
	MOVL	DI, DX
	MOVL	0(DI), DI
	CALL	DI
	RET
249
250/*
251 * support for morestack
252 */
253
// Called during function prolog when more stack is needed.
//
// The traceback routines treat morestack on a g0 as the top of a stack
// (for example, morestack calling newstack calling the scheduler calling
// newm calling gc), so an argument size must be recorded. For that
// purpose, it is declared with no arguments.
TEXT runtime·morestack(SB),NOSPLIT,$0-0
	get_tls(CX)
	MOVL	g(CX), BX
	MOVL	g_m(BX), BX		// BX = m

	// The scheduler stack (m->g0) cannot be grown.
	MOVL	m_g0(BX), SI
	CMPL	g(CX), SI
	JNE	not_g0
	CALL	runtime·badmorestackg0(SB)
	MOVL	0, AX
not_g0:

	// The signal stack (m->gsignal) cannot be grown.
	MOVL	m_gsignal(BX), SI
	CMPL	g(CX), SI
	JNE	not_gsignal
	CALL	runtime·badmorestackgsignal(SB)
	MOVL	0, AX
not_gsignal:

	// We were called from f.
	// Record f's caller in m->morebuf.
	MOVL	8(SP), AX	// f's caller's PC
	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
	LEAL	16(SP), AX	// f's caller's SP
	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
	get_tls(CX)
	MOVL	g(CX), SI
	MOVL	SI, (m_morebuf+gobuf_g)(BX)

	// Record f's own context in g->sched.
	MOVL	0(SP), AX	// f's PC
	MOVL	AX, (g_sched+gobuf_pc)(SI)
	MOVL	SI, (g_sched+gobuf_g)(SI)
	LEAL	8(SP), AX	// f's SP
	MOVL	AX, (g_sched+gobuf_sp)(SI)
	// newstack will fill gobuf.ctxt.

	// Switch to m->g0's stack and call newstack there.
	MOVL	m_g0(BX), BX
	MOVL	BX, g(CX)
	MOVL	(g_sched+gobuf_sp)(BX), SP
	PUSHQ	DX	// ctxt argument
	CALL	runtime·newstack(SB)
	MOVL	$0, 0x1003	// crash if newstack returns
	POPQ	DX	// keep balance check happy
	RET
306
// morestack trampoline for callers without a context:
// zero DX (the ctxt register) and tail-jump to morestack.
TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
	MOVL	$0, DX
	JMP	runtime·morestack(SB)
311
// stackBarrier is reached via a RET to an overwritten return PC.
// It looks up the original return PC in g.stkbar, counts the barrier as
// hit, and jumps to that PC.
// AX may be live on entry; the other registers are free to use.
TEXT runtime·stackBarrier(SB),NOSPLIT,$0
	// BX = g.stkbar[g.stkbarPos].savedLRVal, the original return PC.
	get_tls(CX)
	MOVL	g(CX), CX				// CX = g
	MOVL	(g_stkbar+slice_array)(CX), DX		// DX = base of g.stkbar
	MOVL	g_stkbarPos(CX), BX			// BX = index
	IMULL	$stkbar__size, BX			// element size is too big for SIB scaling
	ADDL	DX, BX					// BX = &g.stkbar[g.stkbarPos]
	MOVL	stkbar_savedLRVal(BX), BX
	// Note that this barrier has been hit.
	ADDL	$1, g_stkbarPos(CX)
	// Continue at the original return PC.
	JMP	BX
328
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// Frames cannot be variable-sized, so a small set of functions with
// constant-sized frames is used instead; picking one of them encodes a
// few bits of the size in the pc.
// Caution: multi-line assembler macros follow.

// DISPATCH jumps to NAME when the argument size in CX is at most MAXSIZE
// (unsigned compare); otherwise execution falls through to whatever follows.
// The target is loaded into AX and reached with an indirect jump rather
// than "JMP NAME(SB)", which gives bad inlining results.
#define DISPATCH(NAME,MAXSIZE)		\
	CMPL	CX, $MAXSIZE;		\
	JA	3(PC);			\
	MOVL	$NAME(SB), AX;		\
	JMP	AX
341
// reflect·call simply tail-jumps to the runtime's reflectcall.
TEXT reflect·call(SB), NOSPLIT, $0-0
	JMP	·reflectcall(SB)
344
// reflectcall loads argsize into CX and jumps to the first call* function
// whose frame size is large enough. Sizes above the largest class end up
// in badreflectcall.
TEXT ·reflectcall(SB), NOSPLIT, $0-20
	MOVLQZX argsize+12(FP), CX	// CX = argsize, zero-extended
	// Size classes, smallest first; the first one that fits wins.
	DISPATCH(runtime·call16, 16)
	DISPATCH(runtime·call32, 32)
	DISPATCH(runtime·call64, 64)
	DISPATCH(runtime·call128, 128)
	DISPATCH(runtime·call256, 256)
	DISPATCH(runtime·call512, 512)
	DISPATCH(runtime·call1024, 1024)
	DISPATCH(runtime·call2048, 2048)
	DISPATCH(runtime·call4096, 4096)
	DISPATCH(runtime·call8192, 8192)
	DISPATCH(runtime·call16384, 16384)
	DISPATCH(runtime·call32768, 32768)
	DISPATCH(runtime·call65536, 65536)
	DISPATCH(runtime·call131072, 131072)
	DISPATCH(runtime·call262144, 262144)
	DISPATCH(runtime·call524288, 524288)
	DISPATCH(runtime·call1048576, 1048576)
	DISPATCH(runtime·call2097152, 2097152)
	DISPATCH(runtime·call4194304, 4194304)
	DISPATCH(runtime·call8388608, 8388608)
	DISPATCH(runtime·call16777216, 16777216)
	DISPATCH(runtime·call33554432, 33554432)
	DISPATCH(runtime·call67108864, 67108864)
	DISPATCH(runtime·call134217728, 134217728)
	DISPATCH(runtime·call268435456, 268435456)
	DISPATCH(runtime·call536870912, 536870912)
	DISPATCH(runtime·call1073741824, 1073741824)
	// No size class fits.
	MOVL	$runtime·badreflectcall(SB), AX
	JMP	AX
376
// CALLFN defines one fixed-frame call function named NAME with a frame of
// MAXSIZE bytes. The function copies argsize bytes from argptr onto its
// own stack, calls f, and hands the bytes from retoffset onward to
// callRet<> so they are copied back to the caller's argument buffer.
#define CALLFN(NAME,MAXSIZE)			\
TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
	NO_LOCAL_POINTERS;			\
	/* stage the arguments on our stack */	\
	MOVL	argptr+8(FP), SI;		\
	MOVL	argsize+12(FP), CX;		\
	MOVL	SP, DI;				\
	REP;MOVSB;				\
	/* invoke f via its closure in DX */	\
	MOVL	f+4(FP), DX;			\
	MOVL	(DX), AX;			\
	CALL	AX;				\
	/* hand the results to callRet */	\
	MOVL	argtype+0(FP), DX;		\
	MOVL	argptr+8(FP), DI;		\
	MOVL	argsize+12(FP), CX;		\
	MOVL	retoffset+16(FP), BX;		\
	MOVL	SP, SI;				\
	ADDL	BX, DI;				\
	ADDL	BX, SI;				\
	SUBL	BX, CX;				\
	CALL	callRet<>(SB);			\
	RET
400
// callRet copies return values back at the end of call*. It is a separate
// function so that it can have its own frame for the arguments to
// reflectcallmove. It does not follow the Go ABI; its inputs arrive in
// registers:
//   DX = argtype
//   DI = destination
//   SI = source
//   CX = byte count
TEXT callRet<>(SB), NOSPLIT, $16-0
	// Spill the register inputs into the outgoing argument slots.
	MOVL	DX, 0(SP)
	MOVL	DI, 4(SP)
	MOVL	SI, 8(SP)
	MOVL	CX, 12(SP)
	CALL	runtime·reflectcallmove(SB)
	RET
412
// Instantiate one fixed-frame call* function per size class.
// Each line must be a well-formed macro invocation, CALLFN(·callN, N);
// the names and sizes mirror the DISPATCH table in ·reflectcall.
CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)
440
// procyield spins: it issues PAUSE and decrements the cycles counter,
// looping until the counter reaches zero.
TEXT runtime·procyield(SB),NOSPLIT,$0-0
	MOVL	cycles+0(FP), AX	// AX = remaining iterations
spin:
	PAUSE
	SUBL	$1, AX
	JNZ	spin
	RET
448
// publicationBarrier emits no instructions beyond RET: x86 already orders
// stores, so this only has to act as a compile barrier.
TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
	RET
453
// void jmpdefer(fn, sp);
// called from deferreturn.
// 1. pop the caller's frame
// 2. back the caller's return address up by 5 bytes
// 3. jump to the deferred function
TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
	MOVL	fv+0(FP), DX		// DX = fn (closure pointer)
	MOVL	argp+4(FP), BX		// BX = caller's argument pointer
	LEAL	-8(BX), SP		// SP = caller's sp just after its CALL
	SUBL	$5, (SP)		// make the return land on that CALL again
	MOVL	0(DX), BX		// BX = code address stored in fn
	JMP	BX			// run the deferred function first
466
// func asmcgocall(fn, arg unsafe.Pointer) int32
// Not implemented: the body only loads from address 0.
TEXT runtime·asmcgocall(SB),NOSPLIT,$0-12
	MOVL	0, AX
	RET
472
// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
// Not implemented: the body only loads from address 0.
TEXT runtime·cgocallback(SB),NOSPLIT,$0-16
	MOVL	0, AX
	RET
478
// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
// Not implemented: the body only loads from address 0.
TEXT ·cgocallback_gofunc(SB),NOSPLIT,$0-16
	MOVL	0, AX
	RET
484
// void setg(G*); set g. Intended for use by needm.
// Not implemented: the body only loads from address 0.
TEXT runtime·setg(SB), NOSPLIT, $0-4
	MOVL	0, AX
	RET
490
// stackcheck verifies that SP lies within the current g's stack bounds
// (g->stack.lo .. g->stack.hi) and faults with a load from address 0 when
// either bound check fails.
TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
	get_tls(CX)
	MOVL	g(CX), AX
	// Upper bound check against stack.hi.
	CMPL	(g_stack+stack_hi)(AX), SP
	JHI	hi_ok
	MOVL	0, AX			// fault: upper bound check failed
hi_ok:
	// Lower bound check against stack.lo.
	CMPL	SP, (g_stack+stack_lo)(AX)
	JHI	lo_ok
	MOVL	0, AX			// fault: lower bound check failed
lo_ok:
	RET
502
// memclrNoHeapPointers zeroes n bytes starting at ptr.
// The bulk is cleared 4 bytes at a time so that the byte-wise tail is at
// most 3 bytes long; that guarantees pointers are never zeroed with STOSB.
// See issue 13160.
TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8
	MOVL	ptr+0(FP), DI		// DI = destination
	MOVL	n+4(FP), CX		// CX = byte count
	MOVQ	CX, BX
	ANDQ	$3, BX			// BX = n % 4, the tail length
	SHRQ	$2, CX			// CX = n / 4, the dword count
	MOVQ	$0, AX			// value to store
	CLD
	// Clear the dwords.
	REP
	STOSL
	// Clear the remaining 0-3 bytes.
	MOVQ	BX, CX
	REP
	STOSB
	RET
520
// getcallerpc returns the return PC stored 8 bytes below argp. If that
// slot holds the stack barrier PC, the original return PC is obtained
// from nextBarrierPC instead.
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12
	MOVL	argp+0(FP),AX		// AX = address of first arg
	MOVL	-8(AX),AX		// AX = calling pc
	CMPL	AX, runtime·stackBarrierPC(SB)
	JNE	have_pc
	// A stack barrier is installed here: fetch the original return PC.
	CALL	runtime·nextBarrierPC(SB)
	MOVL	0(SP), AX
have_pc:
	MOVL	AX, ret+8(FP)
	RET
532
// setcallerpc overwrites the return PC stored 8 bytes below argp with pc.
// If that slot currently holds the stack barrier PC, the new value is
// handed to setNextBarrierPC instead of being written to the stack slot.
TEXT runtime·setcallerpc(SB),NOSPLIT,$8-8
	MOVL	argp+0(FP),AX		// AX = address of first arg
	MOVL	pc+4(FP), BX		// BX = pc to set
	MOVL	-8(AX), CX		// CX = current calling pc
	CMPL	CX, runtime·stackBarrierPC(SB)
	JEQ	barrier
	MOVQ	BX, -8(AX)		// plain case: store the new calling pc
	RET
barrier:
	// Update the stack barrier's saved return PC.
	MOVL	BX, 0(SP)
	CALL	runtime·setNextBarrierPC(SB)
	RET
546
// int64 runtime·cputicks(void)
// Returns the RDTSC counter as one 64-bit value built from DX (high half)
// and AX (low half).
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	RDTSC
	SHLQ	$32, DX			// move the high half into place
	ADDQ	DX, AX			// AX = (DX << 32) + AX
	MOVQ	AX, ret+0(FP)
	RET
554
// memhash_varlen(p unsafe.Pointer, h seed) uintptr
// Forwards to memhash(p, h, size), taking size from offset 4 of the
// closure addressed by DX.
TEXT runtime·memhash_varlen(SB),NOSPLIT,$24-12
	GO_ARGS
	NO_LOCAL_POINTERS
	// Lay out memhash's three arguments.
	MOVL	p+0(FP), AX
	MOVL	AX, 0(SP)
	MOVL	h+4(FP), BX
	MOVL	BX, 4(SP)
	MOVL	4(DX), CX		// size stored in the closure
	MOVL	CX, 8(SP)
	CALL	runtime·memhash(SB)
	// Pass memhash's result through.
	MOVL	16(SP), AX
	MOVL	AX, ret+8(FP)
	RET
571
// Hash functions using AES hardware instructions.
// For now, the one amd64p32 system (NaCl) does not support AES
// instructions, so the implementations have not been written. They can
// be copied and adjusted from asm_amd64.s when the time comes.

// aeshash is a stub: it stores whatever AX currently holds as the result.
TEXT runtime·aeshash(SB),NOSPLIT,$0-20
	MOVL	AX, ret+16(FP)
	RET
581
// aeshashstr is a stub: it stores whatever AX currently holds as the result.
TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET
585
// aeshash32 is a stub: it stores whatever AX currently holds as the result.
TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET
589
// aeshash64 is a stub: it stores whatever AX currently holds as the result.
TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
	MOVL	AX, ret+8(FP)
	RET
593
// memequal(p, q unsafe.Pointer, size uintptr) bool
// Reports whether the size bytes at p and q are equal. Identical pointers
// are answered immediately; otherwise memeqbody does the comparison.
TEXT runtime·memequal(SB),NOSPLIT,$0-17
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	same_ptr
	MOVL	size+8(FP), BX
	CALL	runtime·memeqbody(SB)
	MOVB	AX, ret+16(FP)
	RET
same_ptr:
	MOVB	$1, ret+16(FP)
	RET
607
// memequal_varlen(a, b unsafe.Pointer) bool
// Like memequal, but the length comes from the closure addressed by DX.
TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
	MOVL	a+0(FP), SI
	MOVL	b+4(FP), DI
	CMPL	SI, DI
	JEQ	same_ptr
	MOVL	4(DX), BX		// the compiler stores the size at offset 4 in the closure
	CALL	runtime·memeqbody(SB)
	MOVB	AX, ret+8(FP)
	RET
same_ptr:
	MOVB	$1, ret+8(FP)
	RET
621
// eqstring reports whether two strings are equal.
// The compiler guarantees that strings passed to eqstring have equal
// length, so only s1's length is consulted.
// See runtime_test.go:eqstring_generic for equivalent Go code.
TEXT runtime·eqstring(SB),NOSPLIT,$0-17
	MOVL	s1_base+0(FP), SI
	MOVL	s2_base+8(FP), DI
	CMPL	SI, DI
	JEQ	same_base
	MOVL	s1_len+4(FP), BX
	CALL	runtime·memeqbody(SB)
	MOVB	AX, ret+16(FP)
	RET
same_base:
	MOVB	$1, ret+16(FP)
	RET
639
// memeqbody compares two byte ranges for equality.
// input:
//   SI = a
//   DI = b
//   BX = count
// output:
//   AX = 1 if all count bytes match, 0 otherwise
TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
	XORQ	AX, AX

	CMPQ	BX, $8
	JB	under8

	// 64 bytes per iteration using xmm registers.
loop64:
	CMPQ	BX, $64
	JB	loop8
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	MOVOU	16(SI), X2
	MOVOU	16(DI), X3
	MOVOU	32(SI), X4
	MOVOU	32(DI), X5
	MOVOU	48(SI), X6
	MOVOU	48(DI), X7
	PCMPEQB	X1, X0
	PCMPEQB	X3, X2
	PCMPEQB	X5, X4
	PCMPEQB	X7, X6
	PAND	X2, X0
	PAND	X6, X4
	PAND	X4, X0
	PMOVMSKB X0, DX
	ADDQ	$64, SI
	ADDQ	$64, DI
	SUBQ	$64, BX
	CMPL	DX, $0xffff
	JEQ	loop64
	RET			// mismatch: AX is still 0

	// 8 bytes per iteration using a 64-bit register.
loop8:
	CMPQ	BX, $8
	JBE	tail
	MOVQ	(SI), CX
	MOVQ	(DI), DX
	ADDQ	$8, SI
	ADDQ	$8, DI
	SUBQ	$8, BX
	CMPQ	CX, DX
	JEQ	loop8
	RET			// mismatch: AX is still 0

	// Remaining 0-8 bytes: compare the 8 bytes that end at the end of
	// each range.
tail:
	ADDQ	BX, SI
	ADDQ	BX, DI
	MOVQ	-8(SI), CX
	MOVQ	-8(DI), DX
	CMPQ	CX, DX
	SETEQ	AX
	RET

	// Fewer than 8 bytes in total.
under8:
	CMPQ	BX, $0
	JEQ	result

	LEAQ	0(BX*8), CX
	NEGQ	CX		// CX = -(bits to compare), used as a shift count

	CMPB	SI, $0xf8
	JA	a_high

	// An 8-byte load at SI won't cross a page boundary.
	MOVQ	(SI), SI
	JMP	a_ready
a_high:
	// Address ends in 11111xxx. Load the 8 bytes ending at the last
	// wanted byte and shift them down into position.
	MOVQ	BX, DX
	ADDQ	SI, DX
	MOVQ	-8(DX), SI
	SHRQ	CX, SI
a_ready:

	// Same treatment for DI.
	CMPB	DI, $0xf8
	JA	b_high
	MOVQ	(DI), DI
	JMP	b_ready
b_high:
	MOVQ	BX, DX
	ADDQ	DI, DX
	MOVQ	-8(DX), DI
	SHRQ	CX, DI
b_ready:

	// Take the difference and shift out the bytes beyond count; the
	// zero flag then reflects equality of the wanted bytes.
	SUBQ	SI, DI
	SHLQ	CX, DI
result:
	SETEQ	AX
	RET
737
// cmpstring loads both strings' base/len pairs into the registers cmpbody
// expects and stores cmpbody's result.
TEXT runtime·cmpstring(SB),NOSPLIT,$0-20
	MOVL	s1_base+0(FP), SI	// a
	MOVL	s1_len+4(FP), BX	// alen
	MOVL	s2_base+8(FP), DI	// b
	MOVL	s2_len+12(FP), DX	// blen
	CALL	runtime·cmpbody(SB)
	MOVL	AX, ret+16(FP)
	RET
746
// bytes·Compare loads both slices' base/len pairs into the registers
// cmpbody expects and stores cmpbody's result.
TEXT bytes·Compare(SB),NOSPLIT,$0-28
	MOVL	s1+0(FP), SI		// a
	MOVL	s1+4(FP), BX		// alen
	MOVL	s2+12(FP), DI		// b
	MOVL	s2+16(FP), DX		// blen
	CALL	runtime·cmpbody(SB)
	MOVL	AX, res+24(FP)
	RET
755
// cmpbody performs a three-way comparison of two byte ranges.
// input:
//   SI = a
//   DI = b
//   BX = alen
//   DX = blen
// output:
//   AX = 1/0/-1
TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
	CMPQ	SI, DI
	JEQ	by_length
	CMPQ	BX, DX
	MOVQ	DX, R8
	CMOVQLT	BX, R8 // R8 = min(alen, blen) = number of bytes to compare
	CMPQ	R8, $8
	JB	under8

	// 16 bytes per iteration using xmm registers.
loop16:
	CMPQ	R8, $16
	JBE	tail16
	MOVOU	(SI), X0
	MOVOU	(DI), X1
	PCMPEQB X0, X1
	PMOVMSKB X1, AX
	XORQ	$0xffff, AX	// turn the "equal" mask into a "differs" mask
	JNE	mismatch16	// at least one byte differs
	ADDQ	$16, SI
	ADDQ	$16, DI
	SUBQ	$16, R8
	JMP	loop16

	// AX = bit mask of differing bytes.
mismatch16:
	BSFQ	AX, BX	// index of the first differing byte
	XORQ	AX, AX
	ADDQ	BX, SI
	MOVB	(SI), CX
	ADDQ	BX, DI
	CMPB	CX, (DI)
	SETHI	AX
	LEAQ	-1(AX*2), AX	// 1/0 => +1/-1
	RET

	// 0 through 16 bytes left, alen>=8, blen>=8.
tail16:
	CMPQ	R8, $8
	JBE	tail8
	MOVQ	(SI), AX
	MOVQ	(DI), CX
	CMPQ	AX, CX
	JNE	mismatch8
tail8:
	// Compare the 8 bytes that end at the end of the common prefix.
	ADDQ	R8, SI
	ADDQ	R8, DI
	MOVQ	-8(SI), AX
	MOVQ	-8(DI), CX
	CMPQ	AX, CX
	JEQ	by_length

	// AX and CX hold the parts of a and b that differ.
mismatch8:
	BSWAPQ	AX	// reverse the byte order
	BSWAPQ	CX
	XORQ	AX, CX
	BSRQ	CX, CX	// index of the highest differing bit
	SHRQ	CX, AX	// bring a's bit at that index to the bottom
	ANDQ	$1, AX	// isolate it
	LEAQ	-1(AX*2), AX // 1/0 => +1/-1
	RET

	// 0-7 bytes in common.
under8:
	LEAQ	(R8*8), CX	// bytes left -> bits left
	NEGQ	CX		// -bits left (== 64 - bits left mod 64)
	JEQ	by_length

	// Load the bytes of a into the high bytes of SI.
	CMPB	SI, $0xf8
	JA	a_high
	MOVQ	(SI), SI
	JMP	a_ready
a_high:
	ADDQ	R8, SI
	MOVQ	-8(SI), SI
	SHRQ	CX, SI
a_ready:
	SHLQ	CX, SI

	// Load the bytes of b into the high bytes of DI.
	CMPB	DI, $0xf8
	JA	b_high
	MOVQ	(DI), DI
	JMP	b_ready
b_high:
	ADDQ	R8, DI
	MOVQ	-8(DI), DI
	SHRQ	CX, DI
b_ready:
	SHLQ	CX, DI

	BSWAPQ	SI	// reverse the byte order
	BSWAPQ	DI
	XORQ	SI, DI	// find the differing bits
	JEQ	by_length
	BSRQ	DI, CX	// index of the highest differing bit
	SHRQ	CX, SI	// bring a's bit at that index to the bottom
	ANDQ	$1, SI	// isolate it
	LEAQ	-1(SI*2), AX // 1/0 => +1/-1
	RET

	// The compared bytes are all equal: the lengths decide.
by_length:
	XORQ	AX, AX
	XORQ	CX, CX
	CMPQ	BX, DX
	SETGT	AX	// 1 if alen > blen
	SETEQ	CX	// 1 if alen == blen
	LEAQ	-1(CX)(AX*2), AX	// 1,0,-1 result
	RET
873
// bytes·IndexByte loads the slice base, its length and the byte to find
// into the registers indexbytebody expects and stores the returned index.
TEXT bytes·IndexByte(SB),NOSPLIT,$0-20
	MOVL	s+0(FP), SI		// data
	MOVL	s_len+4(FP), BX		// data len
	MOVB	c+12(FP), AL		// byte sought
	CALL	runtime·indexbytebody(SB)
	MOVL	AX, ret+16(FP)
	RET
881
// strings·IndexByte loads the string base, its length and the byte to find
// into the registers indexbytebody expects and stores the returned index.
TEXT strings·IndexByte(SB),NOSPLIT,$0-20
	MOVL	s+0(FP), SI		// data
	MOVL	s_len+4(FP), BX		// data len
	MOVB	c+8(FP), AL		// byte sought
	CALL	runtime·indexbytebody(SB)
	MOVL	AX, ret+16(FP)
	RET
889
// indexbytebody searches a byte range for a byte value.
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
// output:
//   AX: index of the first match, or -1 when there is none
TEXT runtime·indexbytebody(SB),NOSPLIT,$0
	MOVL	SI, DI			// DI = scan cursor

	CMPL	BX, $16
	JLT	under16

	// Round up to the first 16-byte boundary.
	TESTL	$15, SI
	JZ	aligned
	MOVL	SI, CX
	ANDL	$~15, CX
	ADDL	$16, CX

	// Byte-scan the unaligned head.
	SUBL	SI, CX
	REPN; SCASB
	JZ	scan_hit

// DI is 16-byte aligned; get ready to search using SSE instructions.
aligned:
	// R11 = end of data rounded down to the last 16-byte boundary.
	MOVL	BX, R11
	ADDL	SI, R11
	ANDL	$~15, R11

	// Broadcast c into every byte of X0.
	MOVD	AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL	$0, X0, X0
	JMP	sse_check

sse_loop:
	// Load the next 16-byte chunk of the buffer into X1.
	MOVO	(DI), X1
	// Compare the bytes in X0 against X1.
	PCMPEQB	X0, X1
	// Gather the top bit of each byte of X1 into DX.
	PMOVMSKB X1, DX
	TESTL	DX, DX
	JNZ	sse_hit
	ADDL	$16, DI

sse_check:
	CMPL	DI, R11
	JLT	sse_loop

	// Byte-scan the tail past the last 16-byte boundary.
	MOVL	SI, CX
	ADDL	BX, CX
	SUBL	R11, CX
	// If CX == 0 the zero flag is set here, and falling into the scan
	// would end up reporting a false success.
	JZ	notfound
	REPN; SCASB
	JZ	scan_hit

notfound:
	MOVL	$-1, AX
	RET

// Lengths below 16: plain byte scan over the whole range.
under16:
	MOVL	BX, CX
	REPN; SCASB
	JZ	scan_hit
	JMP	notfound

// The matching 16-byte chunk is known; work out which byte it was.
sse_hit:
	// Index of the least significant set bit.
	BSFW	DX, DX
	SUBL	SI, DI
	ADDL	DI, DX
	MOVL	DX, AX
	RET

// SCASB stopped one byte past the match.
scan_hit:
	SUBL	SI, DI
	SUBL	$1, DI
	MOVL	DI, AX
	RET
980
// bytes·Equal reports whether two byte slices are equal. Slices of
// different length are unequal without looking at their contents;
// otherwise memeqbody compares the bytes.
TEXT bytes·Equal(SB),NOSPLIT,$0-25
	MOVL	a_len+4(FP), BX
	MOVL	b_len+16(FP), CX
	XORL	AX, AX			// default result: not equal
	CMPL	BX, CX
	JNE	store
	MOVL	a+0(FP), SI
	MOVL	b+12(FP), DI
	CALL	runtime·memeqbody(SB)
store:
	MOVB	AX, ret+24(FP)
	RET
993
// fastrand advances the per-m generator state m.fastrand and returns the
// new value: the state is doubled, then XORed with 0x88888eef unless that
// XOR result is negative, in which case the plain doubled value is kept.
TEXT runtime·fastrand(SB), NOSPLIT, $0-4
	get_tls(CX)
	MOVL	g(CX), AX
	MOVL	g_m(AX), AX		// AX = m
	MOVL	m_fastrand(AX), DX	// DX = current state
	ADDL	DX, DX			// DX = state * 2
	MOVL	DX, BX			// BX = doubled value, kept as the fallback
	XORL	$0x88888eef, DX
	CMOVLMI	BX, DX			// sign set by the XOR: use the fallback
	MOVL	DX, m_fastrand(AX)
	MOVL	DX, ret+0(FP)
	RET
1006
// return0 sets AX to 0 and returns.
TEXT runtime·return0(SB), NOSPLIT, $0
	MOVL	$0, AX
	RET
1010
// The top-most function running on a goroutine
// returns to goexit+PCQuantum.
// The NOP bytes on either side of the CALL keep both that return address
// and the return address of the CALL inside goexit's code range.
TEXT runtime·goexit(SB),NOSPLIT,$0-0
	BYTE	$0x90	// NOP
	CALL	runtime·goexit1(SB)	// does not return
	// a traceback from goexit1 must land inside goexit's code range
	BYTE	$0x90	// NOP
1018
// prefetcht0 issues a PREFETCHT0 hint for the memory at addr.
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
	MOVL	addr+0(FP), AX
	PREFETCHT0	(AX)
	RET
1023
// prefetcht1 issues a PREFETCHT1 hint for the memory at addr.
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
	MOVL	addr+0(FP), AX
	PREFETCHT1	(AX)
	RET
1028
1029
// prefetcht2 issues a PREFETCHT2 hint for the memory at addr.
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
	MOVL	addr+0(FP), AX
	PREFETCHT2	(AX)
	RET
1034
// prefetchnta issues a PREFETCHNTA hint for the memory at addr.
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
	MOVL	addr+0(FP), AX
	PREFETCHNTA	(AX)
	RET
1039
// checkASM unconditionally returns true (1).
TEXT ·checkASM(SB),NOSPLIT,$0-1
	MOVB	$1, ret+0(FP)
	RET
1043