1;; -----------------------------------------------------------------------
2;;
3;;   Copyright 1994-2009 H. Peter Anvin - All Rights Reserved
4;;   Copyright 2009-2010 Intel Corporation; author: H. Peter Anvin
5;;
6;;   This program is free software; you can redistribute it and/or modify
7;;   it under the terms of the GNU General Public License as published by
8;;   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
9;;   Boston MA 02111-1307, USA; either version 2 of the License, or
10;;   (at your option) any later version; incorporated herein by reference.
11;;
12;; -----------------------------------------------------------------------
13
14;;
15;; bcopy32xx.inc
16;;
17
18
;
; 32-bit bcopy routine
;
; This is the actual 32-bit portion of the bcopy and shuffle and boot
; routines.  ALL THIS CODE NEEDS TO BE POSITION-INDEPENDENT, with the
; sole exception being the actual relocation code at the beginning of
; pm_shuffle.
;
; It also really needs to live all in a single segment, for the
; address calculations to actually work.
;

		bits 32
		section .bcopyxx.text
		align 16
;
; pm_bcopy:
;
;	This is the protected-mode core of the "bcopy" routine.
;	Copies ECX bytes from ESI to EDI, or zero-fills ECX bytes at
;	EDI when ESI == -1.  If the source lies below the destination
;	and the two regions overlap, the copy is done backwards.
;
;	Try to do aligned transfers; if the src and dst are relatively
;	misaligned, align the dst.  The destination is brought to a
;	dword boundary with at most one byte and one word transfer, the
;	bulk is moved a dword at a time, and the leftover word/byte is
;	handled last.
;
;	Inputs:
;	  ESI = source address, or -1 to zero-fill
;	  EDI = destination address
;	  ECX = byte count; guaranteed to not be zero on entry
;	  DF  = must be clear (the forward and zero paths use the
;		string instructions without setting it)
;
;	Clobbers ESI, EDI, ECX and the arithmetic flags.
;	EAX and EBX are saved and restored; EDX is not touched.
;	DF is clear on return.
;

pm_bcopy:
		push ebx
		push eax

		cmp esi,-1
		je .bzero

		cmp esi,edi		; If source < destination, we might
		jb .reverse		; have to copy backwards

.forward:
		; Initial alignment.  Each step tests the *current* EDI:
		; the byte step changes the low two address bits, so the
		; word step must not be decided from the entry value.
		test edi,1
		jz .faa1
		movsb
		dec ecx
.faa1:
		mov al,cl		; Low bits, in case we exit via .f_tiny
		cmp ecx,2
		jb .f_tiny

		test edi,2		; EDI is even here; dword aligned?
		jz .faa2
		movsw
		sub ecx,2
.faa2:

		; Bulk transfer
		mov al,cl		; Save low bits
		shr ecx,2		; Convert to dwords
		rep movsd		; Do our business
		; At this point ecx == 0

		test al,2
		jz .fab2
		movsw
.fab2:
.f_tiny:
		test al,1
		jz .fab1
		movsb
.fab1:
.done:
		pop eax
		pop ebx
		ret

.reverse:
		lea eax,[esi+ecx-1]	; Point to final byte
		cmp edi,eax
		ja .forward		; No overlap, do forward copy

		std			; Reverse copy
		lea edi,[edi+ecx-1]
		mov esi,eax

		; Initial alignment.  With DF set, ESI/EDI address the
		; lowest byte of each unit.  If the final byte sits at an
		; even address it cannot be the top half of an aligned
		; word, so move it on its own.
		test edi,1
		jnz .raa1
		movsb
		dec ecx
.raa1:

		dec esi			; Step from byte slot to word slot
		dec edi
		mov al,cl
		cmp ecx,2
		jb .r_tiny
		; EDI (even) is the word slot.  The dword slot is EDI-2,
		; which is dword aligned exactly when bit 1 of EDI is set.
		test edi,2
		jnz .raa2
		movsw
		sub ecx,2
.raa2:

		; Bulk copy
		sub esi,2		; Step from word slot to dword slot
		sub edi,2
		mov al,cl		; Save low bits
		shr ecx,2
		rep movsd

		; Final alignment
.r_final:
		add esi,2		; Back to word slot
		add edi,2
		test al,2
		jz .rab2
		movsw
.rab2:
.r_tiny:
		inc esi			; Back to byte slot
		inc edi
		test al,1
		jz .rab1
		movsb
.rab1:
		cld
		jmp .done

.bzero:
		xor eax,eax		; Fill pattern; EBX takes over the
					; "low bits" role AL has above

		; Initial alignment (same scheme as .forward)
		test edi,1
		jz .zaa1
		stosb
		dec ecx
.zaa1:

		mov bl,cl
		cmp ecx,2
		jb .z_tiny
		test edi,2
		jz .zaa2
		stosw
		sub ecx,2
.zaa2:

		; Bulk
		mov bl,cl		; Save low bits
		shr ecx,2
		rep stosd

		test bl,2
		jz .zab2
		stosw
.zab2:
.z_tiny:
		test bl,1
		jz .zab1
		stosb
.zab1:
		; No "short" override here: .done is at the very edge of
		; the 8-bit displacement range, so let the assembler pick
		; the encoding.
		jmp .done
184
;
; shuffle_and_boot:
;
; Shuffle memory around as directed by a descriptor list, then hand
; control to an entry point somewhere in low memory.  This routine
; can clobber any memory outside the bcopy special area.
;
; IMPORTANT: This routine does not set up any registers.
; It is the responsibility of the caller to generate an appropriate entry
; stub; *especially* when going to real mode.
;
; Inputs:
;	ESI		-> Pointer to list of (dst, src, len) triplets(*)
;	EDI		-> Pointer to safe area for list + shuffler
;			   (must not overlap this code nor the RM stack)
;	ECX		-> Byte count of list area (for initial copy)
;
;     If src == -1: the memory described by (dst, len) is zeroed;
;		    this is handled inside the bcopy routine.
;
;     If len == 0:  this marks the end of the list; dst is the entry
;		    point and src selects the mode.  As implemented
;		    below, src == 0 leaves through the 16-bit path
;		    (pm_shuffle_16) and any other value jumps to dst
;		    in protected mode.
;
;     (*) dst, src, and len are four bytes each
;
; do_raw_shuffle_and_boot is the same entry point, but with a C ABI:
; do_raw_shuffle_and_boot(safearea, descriptors, bytecount)
; with the three arguments arriving in EAX, EDX and ECX.
;
		global do_raw_shuffle_and_boot
do_raw_shuffle_and_boot:
		mov esi,edx		; descriptors
		mov edi,eax		; safearea
		; bytecount is already in ECX; fall through

pm_shuffle:
		cli			; End interrupt service (for good)
		lea edx,[edi+ecx+15]	; EDX <- just past the list copy,
		and edx,~15		;  rounded up to 16 for the GDT
		mov ebx,edi		; EBX <- descriptor list (safe copy)
		call pm_bcopy		; Copy the list; EBX, EDX preserved
		mov ecx,__bcopyxx_dwords
		mov esi,__bcopyxx_start	; Absolute source address
		mov edi,edx		; Absolute target address
		sub edx,esi		; EDX <- address delta
		lea eax,[edx+.relocated] ; Resume point inside the copy
		rep movsd		; Relocate this code
		jmp eax			; Continue in the relocated copy
.relocated:
		; Stack goes bcopyxx_stack bytes past the relocated image
		lea esp,[edx+bcopyxx_stack+__bcopyxx_end]
		add edx,bcopy_gdt	; EDX <- relocated GDT
		mov [edx+2],edx		; Patch the GDT self-pointer
		lgdt [edx]		; Switch to the relocated GDT

		; Walk the descriptor list, 12 bytes per entry
.next:
		mov edi,[ebx]		; dst
		mov esi,[ebx+4]		; src
		mov ecx,[ebx+8]		; len
		add ebx,12
		jecxz .finish		; len == 0 terminates the list
		call pm_bcopy
		jmp .next
.finish:
		push ecx		; == 0, popped into EFLAGS below
		lidt [edx+RM_IDT_ptr-bcopy_gdt]	; RM-like IDT
		test esi,esi		; src field of the final entry
		jz pm_shuffle_16	; Zero: leave through 16-bit mode
		popfd			; Clean the flags
		jmp edi			; Protected mode entry

		; 16-bit entry point: rebase the two 16-bit descriptors
		; on the entry address (EDI) and load them.  EDX still
		; points to the GDT until DX is overwritten below.
pm_shuffle_16:
		mov eax,edi
		mov [edx+PM_DS16+2],ax	; Descriptor base bits 15:0
		mov [edx+PM_CS16+2],ax
		shr eax,16
		mov [edx+PM_DS16+4],al	; Base bits 23:16
		mov [edx+PM_DS16+7],ah	; Base bits 31:24
		mov [edx+PM_CS16+4],al
		mov [edx+PM_CS16+7],ah
		; EAX <- CR0 with bit 0 cleared.  It is NOT written back
		; here; presumably the 16-bit entry stub does that.
		; NOTE(review): confirm against the stub generator.
		mov eax,cr0
		and al,~1
		popfd			; Clean the flags
		; Nothing from here on may modify the flags
		mov dx,PM_DS16
		mov ds,edx
		mov es,edx
		mov fs,edx
		mov gs,edx
		mov ss,edx
		jmp PM_CS16:0		; Offset 0 == entry point (base = EDI)
278
279		section	.bcopyxx.data
280
281		alignz 16
282; GDT descriptor entry
283%macro desc 1
284bcopy_gdt.%1:
285PM_%1		equ bcopy_gdt.%1-bcopy_gdt
286%endmacro
287
288bcopy_gdt:
289		dw bcopy_gdt_size-1	; Null descriptor - contains GDT
290		dd bcopy_gdt		; pointer for LGDT instruction
291		dw 0
292
293		; TSS segment to keep Intel VT happy.  Intel VT is
294		; unhappy about anything that doesn't smell like a
295		; full-blown 32-bit OS.
296	desc TSS
297		dw 104-1, DummyTSS	; 08h 32-bit task state segment
298		dd 00008900h		; present, dpl 0, 104 bytes @DummyTSS
299
300	desc CS16
301		dd 0000ffffh		; 10h Code segment, use16, readable,
302		dd 00009b00h		; present, dpl 0, cover 64K
303	desc DS16
304		dd 0000ffffh		; 18h Data segment, use16, read/write,
305		dd 00009300h		; present, dpl 0, cover 64K
306	desc CS32
307		dd 0000ffffh		; 20h Code segment, use32, readable,
308		dd 00cf9b00h		; present, dpl 0, cover all 4G
309	desc DS32
310		dd 0000ffffh		; 28h Data segment, use32, read/write,
311		dd 00cf9300h		; present, dpl 0, cover all 4G
312
313bcopy_gdt_size:	equ $-bcopy_gdt
314;
315; Space for a dummy task state segment.  It should never be actually
316; accessed, but just in case it is, point to a chunk of memory that
317; has a chance to not be used for anything real...
318;
319DummyTSS	equ 0x580
320
321		align 4
322RM_IDT_ptr:	dw 0FFFFh		; Length (nonsense, but matches CPU)
323		dd 0			; Offset
324
325bcopyxx_stack	equ 128			; We want this much stack
326
;
; __syslinux_shuffler_size: a 32-bit read-only variable holding the
; value of the external symbol __bcopyxx_len.
;
		section .rodata
		extern __bcopyxx_len
		global __syslinux_shuffler_size

		align 4
__syslinux_shuffler_size:
		dd __bcopyxx_len

		; Back to 16-bit code for whatever follows this file
		bits 16
		section .text16
336