/*
Copyright (C) 1996-1997 Id Software, Inc.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

*/
//
// d_draw.s
// x86 assembly-language horizontal 8-bpp span-drawing code.
//
24
// Project headers.  Between them they are expected to supply the C()
// symbol-decoration macro, the espan_t_* field offsets and the extern
// declarations for the data symbols used in this file (which header
// provides which is not visible from here).
#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"

// everything in this file is assembled only when id386 is non-zero
#if	id386
31
//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//
// In:    one stack argument, the pointer to the first span descriptor
//        (read from pspans(%esp) once the four registers are pushed).
//        The list is followed through espan_t_pnext until that field
//        is zero.
// Saved: %ebp, %edi, %esi, %ebx (pushed on entry, popped before ret).
// Clobb: %eax, %ecx, %edx, flags, and the scratch memory variables
//        (s, t, snext, tnext, sfracf, tfracf, tstep, advancetable,
//        counttemp, spancountminus1, jumptemp, pspantemp, pbase,
//        zi8stepu, sdivz8stepu, tdivz8stepu).
// x87:   three values (1/z | t/z | s/z) are kept on the FP stack for the
//        whole of a span and popped at LEndSpan; a fourth (z*64k) sits
//        on top while a divide is in flight.
//
// Each span is drawn in segments of up to 8 pixels.  s and t are
// computed with a real divide only at segment ends and stepped linearly
// in between; the divide for the next segment end is started before the
// pixels of the current segment are drawn so that the two overlap.
//----------------------------------------------------------------------

	.text

// out-of-line, rarely-needed clamping code

// Clamp 0/1: s and t at the start of the span.  These are entered with
// "ja" (unsigned), so the value is either negative or above the extent;
// the flags of that compare are still live here, and the signed "jg"
// tells the two cases apart: above the extent -> extent, negative -> 0.
LClampHigh0:
	movl	C(bbextents),%esi
	jmp		LClampReentry0
LClampHighOrLow0:
	jg		LClampHigh0
	xorl	%esi,%esi
	jmp		LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx
	jmp		LClampReentry1
LClampHighOrLow1:
	jg		LClampHigh1
	xorl	%edx,%edx
	jmp		LClampReentry1

// Clamp 2/3: s and t at the end of a full 8-pixel segment.
// Low side clamps to 2048 rather than 0, high side to the extent.
LClampLow2:
	movl	$2048,%ebp
	jmp		LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp		LClampReentry2

LClampLow3:
	movl	$2048,%ecx
	jmp		LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp		LClampReentry3

// Clamp 4/5: s and t at the end of the last (partial) segment.
LClampLow4:
	movl	$2048,%eax
	jmp		LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp		LClampReentry4

LClampLow5:
	movl	$2048,%ebx
	jmp		LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp		LClampReentry5


// stack offset of the argument: return address (4) + four pushed regs (16)
#define pspans	4+16

	.align 4
.globl C(D_DrawSpans8)
C(D_DrawSpans8):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-8 steps, for 8-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_8
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_8
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_8
	movl	%edx,pbase			// pbase = cacheblock
	fstps	zi8stepu
	fstps	tdivz8stepu
	fstps	sdivz8stepu

LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld		%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
							//  du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
							//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
							//  s/z
	fmuls	C(d_zistepv)		// dv*d_zistepv | dv*d_tdivzstepv |
							//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv |  dv*d_zistepv |
							//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)	// dv*d_zistepv |
							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)		// du*d_zistepu |
							//  dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
							//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
							//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)		// zi = d_ziorigin + dv*d_zistepv +
							//  du*d_zistepu; stays in %st(0) at end
							// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
// NOTE(review): per the stack comment this leaves fp_64k/(1/z) in %st(1);
// that depends on GNU as's AT&T operand convention for fdiv/fdivr with a
// non-%st(0) destination -- confirm before porting to another assembler.
	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

//
// point %edi to the first pixel in the span
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp	// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span
//
// Three cases, each of which first finishes the s and t calcs:
//  count > 8   step 1/z, s/z, t/z by 8 pixels and start the divide;
//              %ecx keeps the full count
//  2..8        step by count-1 pixels and start the divide;
//              %ecx = count-1
//  1           no divide is needed; %ecx = 0
//
	cmpl	$8,%ecx
	ja		LSetupNotLast1

	decl	%ecx
	jz		LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fildl	spancountminus1

	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)		// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight1

LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z
	jmp		LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fadds	zi8stepu
	fxch	%st(2)
	fadds	sdivz8stepu
	fxch	%st(2)
	flds	tdivz8stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight1:

// add the adjusts to the start-of-span s and t and clamp them to
// [0, bbextents] / [0, bbextentt].  A single unsigned compare catches both
// the negative and the too-large case; the out-of-line code sorts them out.
	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja		LClampHighOrLow0
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi
	cmpl	%ebp,%edx
	movl	%esi,sfracf			// movl leaves the flags of the cmpl intact
	ja		LClampHighOrLow1
LClampReentry1:
	movl	%edx,t
	movl	s,%esi					// sfrac = scans->sfrac;
	shll	$16,%edx
	movl	t,%eax					// tfrac = scans->tfrac;
	sarl	$16,%esi
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax
	movl	C(cachewidth),%edx
	imull	%edx,%eax				// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi				// psource = pbase + (sfrac >> 16) +
									//           ((tfrac >> 16) * cachewidth);

//
// determine whether last segment or not
//
	cmpl	$8,%ecx
	jna		LLastSegment

//
// not the last segment; do full 8-wide segment
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl	// get first source texel
	subl	$8,%ecx		// count off this segments' pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp	// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)	// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$2048,%ebp
	jl		LClampLow2
	cmpl	%eax,%ebp
	ja		LClampHigh2
LClampReentry2:

	cmpl	$2048,%ecx
	jl		LClampLow3
	cmpl	%edx,%ecx
	ja		LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp			// %ebp = s delta over the 8-pixel segment
	subl	t,%ecx			// %ecx = t delta over the 8-pixel segment

//
// set up advancetable
//
// The deltas are 16.16 values spanning 8 pixels, so the whole-texel part
// of the per-pixel step is delta >> (16+3) and the fractional part is
// delta << (16-3), left-justified in 32 bits.
//
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$19,%eax			// tstep = (t delta / 8) >> 16;
	jz		LZero				// whole t step is 0: skip the multiply
	sarl	$19,%edx			// sstep = (s delta / 8) >> 16;
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp		LSetUp1

LZero:
	sarl	$19,%edx			// sstep = (s delta / 8) >> 16;
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$13,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$13,%ecx			// left-justify tstep fractional part
	movl	%eax,advancetable	// advance extra in t

//
// Pixel stepping, used from here to the end of the segment:
//  %esi = source texel pointer      %edi = dest pointer
//  %edx = t fraction accumulator    tstep = t fraction step (memory)
//  %ebx = s fraction accumulator    %ebp  = s fraction step
//  %al  = texel in flight
// Each step adds tstep to %edx and turns the carry into %ecx = -1 or 0,
// which selects advancetable+0 (one extra row) or advancetable+4; the
// adcl then folds in the carry out of the s fraction add as +1 texel.
//
	movl	%ecx,tstep
	addl	%ecx,%edx			// advance tfrac fractional part by tstep frac

	sbbl	%ecx,%ecx			// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx			// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi


//
// start FDIV for end of next segment in flight, so it can overlap
//
	movl	counttemp,%ecx
	cmpl	$8,%ecx			// more than one segment after this?
	ja		LSetupNotLast2	// yes

	decl	%ecx
	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)		// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)	// 64k

	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi8stepu
	fxch	%st(2)
	fadds	sdivz8stepu
	fxch	%st(2)
	flds	tdivz8stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight2:
	movl	%ecx,counttemp	// full remaining count if > 8, else count-1

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	$8,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s				// the segment end becomes the next start
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last segment or not
//
	cmpl	$8,%ecx				// are there multiple segments remaining?
	movb	%al,-1(%edi)		// 8th pixel; %edi was already advanced
	ja		LNotLastSegment		// yes

//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
// On every path into here %ecx = (pixels left in the span) - 1, i.e. the
// number of steps still to take (0..7).
//
	testl	%ecx,%ecx
	jz		LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$2048,%eax
	jl		LClampLow4
	cmpl	%ebp,%eax
	ja		LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$2048,%ebx
	jl		LClampLow5
	cmpl	%edx,%ebx
	ja		LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// don't bother
	je		LOnlyOneStep	// if two pixels in segment, there's only one step,
							//  of the segment length
	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

// one-operand imull: %edx:%eax = %eax * operand; the step is the high
// dword, left in %edx
	imull	reciprocal_table-8(,%ecx,4) // sstep = (snext - s) / (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table-8(,%ecx,4) // tstep = (tnext - t) / (spancount-1)

LSetEntryvec:
//
// set up advancetable
//
// here %ebp = sstep and %edx = tstep, both 16.16, and %ecx = step count
//
	movl	entryvec_table(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// entry point into code for RET later
	movl	%ebp,%ecx
	sarl	$16,%edx			// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx			// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$16,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax			// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

// take the first step here; %eax still holds the t fraction step for the
// entry code, which uses it once more before overwriting %al
	movl	%eax,tstep
	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp		*jumptemp			// jump to the number-of-pixels handler

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$7,%edi			// adjust for hardwired offset
	jmp		LEndSpan


LOnlyOneStep:
	subl	s,%eax			// with a single step, the step is the whole
	subl	t,%ebx			//  delta: no reciprocal multiply needed
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp		LSetEntryvec

//----------------------------------------
//
// Entry points for a last segment of 2..8 pixels, reached through
// jumptemp.  The first pixel has already been stored and the first step
// already taken.  Each entry backs %edi up so that the fixed offsets in
// the shared tail (2(%edi)..7(%edi)) land on the right pixels, takes one
// more step where needed, and joins the tail at the matching LLEntryN_8.
// movb does not touch the flags, so the sbbl after it still sees the
// carry of the preceding addl.
//

.globl	Entry2_8
Entry2_8:
	subl	$6,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp		LLEntry2_8

//----------------------------------------

.globl	Entry3_8
Entry3_8:
	subl	$5,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp		LLEntry3_8

//----------------------------------------

.globl	Entry4_8
Entry4_8:
	subl	$4,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LLEntry4_8

//----------------------------------------

.globl	Entry5_8
Entry5_8:
	subl	$3,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LLEntry5_8

//----------------------------------------

.globl	Entry6_8
Entry6_8:
	subl	$2,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LLEntry6_8

//----------------------------------------

.globl	Entry7_8
Entry7_8:
	decl	%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LLEntry7_8

//----------------------------------------

.globl	Entry8_8
Entry8_8:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LLEntry7_8:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LLEntry6_8:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LLEntry5_8:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LLEntry4_8:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LLEntry3_8:
	movb	%al,6(%edi)
	movb	(%esi),%al
LLEntry2_8:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp %st(0)
	fstp %st(0)
	fstp %st(0)

	movl	pspantemp,%ebx				// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx			// any more spans?
	movb	%al,7(%edi)			// last pixel of the span; movb keeps the
								//  flags of the testl for the jnz
	jnz		LSpanLoop			// more spans

	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret
763
//----------------------------------------------------------------------
// 8-bpp horizontal span z drawing code for polygons, with no transparency.
//
// Assumes there is at least one span in pzspans, and that every span
// contains at least one pixel
//
// In:    one stack argument, the pointer to the first span descriptor
//        (read from pzspans(%esp) once the four registers are pushed).
//        The list is followed through espan_t_pnext until that field
//        is zero.
// Saved: %ebp, %edi, %esi, %ebx (pushed on entry, popped before ret).
// Clobb: %eax, %ecx, %edx, flags, izistep, izi.
//
// For each span, 1/z at the first pixel is converted to a 32-bit integer
// (scaled by Float2ToThe31nd) and stepped by the integer izistep per
// pixel; the upper 16 bits of each value are stored as one z-buffer word.
// Words are written two to a dword wherever the destination allows it.
//----------------------------------------------------------------------

	.text

// z-clamp on a non-negative gradient span
// The span is filled with a constant: 0x40000000 is 0.5 scaled by 2**31,
// and the step in %ebx is zeroed for this span only (the span loop
// reloads %ebx from izistep at the start of every span).
LClamp:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)			// discard the unconverted 1/z
	jmp		LZDraw

// z-clamp on a negative gradient span
LClampNeg:
	movl	$0x40000000,%edx
	xorl	%ebx,%ebx
	fstp	%st(0)			// discard the unconverted 1/z
	jmp		LZDrawNeg


// stack offset of the argument: return address (4) + four pushed regs (16)
#define pzspans	4+16

.globl C(D_DrawZSpans)
C(D_DrawZSpans):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

	flds	C(d_zistepu)
	movl	C(d_zistepu),%eax
	movl	pzspans(%esp),%esi
// NOTE(review): jz is taken only when the bit pattern of d_zistepu is all
// zero, so despite its name the "Neg" path is not where negative gradients
// go; those take the path below with a negative izistep and addl.  Whether
// js was intended cannot be told from this file; left as is.
	testl	%eax,%eax
	jz		LFNegSpan

	fmuls	Float2ToThe31nd
	fistpl	izistep		// note: we are relying on FP exceptions being turned
						// off here to avoid range problems

LFSpanLoop:
// Reload the step for every span.  It used to be loaded once before the
// loop, but LClamp zeroes %ebx, which left every span after a clamped one
// with no z gradient.
	movl	izistep,%ebx

// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx				// word count
	movl	espan_t_count(%esi),%ecx
	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
	pushl	%esi		// preserve spans pointer
	fnstsw	%ax
	testb	$0x45,%ah	// C3, C2 and C0 all clear means 1/z > 0.5
	jz		LClamp

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
						// off here to avoid problems when the span is closer
						// than 1/(2**31)
	movl	izi,%edx

// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest

LZDraw:

// do a single pixel up front, if necessary to dword align the destination
	testl	$2,%edi
	jz		LFMiddle
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do middle a pair of aligned dwords at a time
LFMiddle:
	pushl	%ecx				// keep the count for the odd-pixel test
	shrl	$1,%ecx				// count / 2
	jz		LFLast				// no aligned dwords to do
	shrl	$1,%ecx				// (count / 2) / 2
	jnc		LFMiddleLoop		// even number of aligned dwords to do

// odd number of dwords: do one on its own.  The first pixel's z goes in
// the low word, the second pixel's in the high word.
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx			// any dword pairs left?
	jz		LFLast

LFMiddleLoop:
	movl	%edx,%eax
	addl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	addl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	addl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%ebp
	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz		LFMiddleLoop

LFLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz		LFSpanDone		// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz		LFSpanLoop

	jmp		LFDone

// Same algorithm as above, but izistep holds the negated gradient and is
// subtracted per pixel.
LFNegSpan:
	fmuls	FloatMinus2ToThe31nd
	fistpl	izistep		// note: we are relying on FP exceptions being turned
						// off here to avoid range problems

LFNegSpanLoop:
// Reload the step for every span.  It used to be loaded once before the
// loop, but LClampNeg zeroes %ebx, which left every span after a clamped
// one with no z gradient.
	movl	izistep,%ebx

// set up the initial 1/z value
	fildl	espan_t_v(%esi)
	fildl	espan_t_u(%esi)
	movl	espan_t_v(%esi),%ecx
	movl	C(d_pzbuffer),%edi
	fmuls	C(d_zistepu)
	fxch	%st(1)
	fmuls	C(d_zistepv)
	fxch	%st(1)
	fadds	C(d_ziorigin)
	imull	C(d_zrowbytes),%ecx
	faddp	%st(0),%st(1)

// clamp if z is nearer than 2 (1/z > 0.5)
	fcoms	float_point5
	addl	%ecx,%edi
	movl	espan_t_u(%esi),%edx
	addl	%edx,%edx				// word count
	movl	espan_t_count(%esi),%ecx
	addl	%edx,%edi				// pdest = &pdestspan[scans->u];
	pushl	%esi		// preserve spans pointer
	fnstsw	%ax
	testb	$0x45,%ah	// C3, C2 and C0 all clear means 1/z > 0.5
	jz		LClampNeg

	fmuls	Float2ToThe31nd
	fistpl	izi			// note: we are relying on FP exceptions being turned
						// off here to avoid problems when the span is closer
						// than 1/(2**31)
	movl	izi,%edx

// at this point:
// %ebx = izistep
// %ecx = count
// %edx = izi
// %edi = pdest

LZDrawNeg:

// do a single pixel up front, if necessary to dword align the destination
	testl	$2,%edi
	jz		LFNegMiddle
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	decl	%ecx
	movw	%ax,(%edi)
	addl	$2,%edi

// do middle a pair of aligned dwords at a time
LFNegMiddle:
	pushl	%ecx				// keep the count for the odd-pixel test
	shrl	$1,%ecx				// count / 2
	jz		LFNegLast			// no aligned dwords to do
	shrl	$1,%ecx				// (count / 2) / 2
	jnc		LFNegMiddleLoop		// even number of aligned dwords to do

// odd number of dwords: do one on its own
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%eax,(%edi)
	addl	$4,%edi
	andl	%ecx,%ecx			// any dword pairs left?
	jz		LFNegLast

LFNegMiddleLoop:
	movl	%edx,%eax
	subl	%ebx,%edx
	shrl	$16,%eax
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%eax
	movl	%edx,%ebp
	movl	%eax,(%edi)
	subl	%ebx,%edx
	shrl	$16,%ebp
	movl	%edx,%esi
	subl	%ebx,%edx
	andl	$0xFFFF0000,%esi
	orl		%esi,%ebp
	movl	%ebp,4(%edi)	// FIXME: eliminate register contention
	addl	$8,%edi

	decl	%ecx
	jnz		LFNegMiddleLoop

LFNegLast:
	popl	%ecx			// retrieve count
	popl	%esi			// retrieve span pointer

// do the last, unaligned pixel, if there is one
	andl	$1,%ecx			// is there an odd pixel left to do?
	jz		LFNegSpanDone	// no
	shrl	$16,%edx
	movw	%dx,(%edi)		// do the final pixel's z

LFNegSpanDone:
	movl	espan_t_pnext(%esi),%esi
	testl	%esi,%esi
	jnz		LFNegSpanLoop

LFDone:
	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

#endif	// id386
1038