1/*
2Copyright (C) 1996-1997 Id Software, Inc.
3
4This program is free software; you can redistribute it and/or
5modify it under the terms of the GNU General Public License
6as published by the Free Software Foundation; either version 2
7of the License, or (at your option) any later version.
8
9This program is distributed in the hope that it will be useful,
10but WITHOUT ANY WARRANTY; without even the implied warranty of
11MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12
13See the GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program; if not, write to the Free Software
17Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18
19*/
20//
21// d_parta.s
22// x86 assembly-language 8-bpp particle-drawing code.
23//
24
25#include "asm_i386.h"
26#include "quakeasm.h"
27#include "d_ifacea.h"
28#include "asm_draw.h"
29
30#if	id386
31
32//----------------------------------------------------------------------
33// 8-bpp particle drawing code.
34//----------------------------------------------------------------------
35
36//FIXME: comments, full optimization
37
//----------------------------------------------------------------------
// D_DrawParticle: transforms, projects, clips and plots one particle.
//----------------------------------------------------------------------
41
42	.text
43
// Offset from %esp to the stack argument once the three register saves
// at the top of D_DrawParticle are done: 3 pushes * 4 bytes plus the
// 4-byte return address.
#define P	12+4

//----------------------------------------------------------------------
// D_DrawParticle
//
// Transforms one particle into view space, rejects it if it is nearer
// than float_particle_z_clip or outside the particle clip rectangle,
// projects it to integer screen coordinates and jumps to a z-buffered
// plot routine chosen by the particle's on-screen size.
//
// In:    one stack argument, read at P(%esp): pointer to the particle.
//        Only pt_org (three floats) and pt_color (one float) are read.
// Out:   no value is deliberately left in a register.
// Saves: %ebp, %edi and %ebx are pushed here and popped at LDone.
// Uses:  %eax, %ecx, %edx; scratch memory DP_u, DP_v, DP_Color and izi.
//        Every exit path pops exactly what it pushed on the x87 stack.
//
// State handed to the plot code (DP_1x1..DP_4x4 and LDefault):
//   %edi = C(d_viewbuffer) + u + C(d_scantable)[v]   (destination pixel)
//   %edx = C(d_pzbuffer) + v*C(d_zrowbytes) + u*2    (z-buffer entry)
//   %ebp = izi (the plot code uses the low 16 bits, %bp)
//   %ch  = low byte of DP_Color
//   %eax = pix, the clamped particle size
//
// The stack pictures in the comments list the x87 registers starting
// with %st(0) on the left.
//----------------------------------------------------------------------

	.align 4
.globl C(D_DrawParticle)
C(D_DrawParticle):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi				// preserve register variables
	pushl	%ebx

	movl	P(%esp),%edi		// %edi = particle pointer (the stack argument)

// FIXME: better FP overlap in general here

// transform point
//	VectorSubtract (p->org, r_origin, local);
	flds	C(r_origin)		// r_origin[0]
	fsubrs	pt_org(%edi)	// local[0]
	flds	pt_org+4(%edi)	// p->org[1] | local[0]
	fsubs	C(r_origin)+4	// local[1] | local[0]
	flds	pt_org+8(%edi)	// p->org[2] | local[1] | local[0]
	fsubs	C(r_origin)+8	// local[2] | local[1] | local[0]
	fxch	%st(2)			// local[0] | local[1] | local[2]

//	transformed[2] = DotProduct(local, r_ppn);
	flds	C(r_ppn)		// r_ppn[0] | local[0] | local[1] | local[2]
	fmul	%st(1),%st(0)	// dot0 | local[0] | local[1] | local[2]
	flds	C(r_ppn)+4	// r_ppn[1] | dot0 | local[0] | local[1] | local[2]
	fmul	%st(3),%st(0)	// dot1 | dot0 | local[0] | local[1] | local[2]
	flds	C(r_ppn)+8	// r_ppn[2] | dot1 | dot0 | local[0] |
						//  local[1] | local[2]
	fmul	%st(5),%st(0)	// dot2 | dot1 | dot0 | local[0] | local[1] | local[2]
	fxch	%st(2)		// dot0 | dot1 | dot2 | local[0] | local[1] | local[2]
	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[0] | local[1] |
						  //  local[2]
	faddp	%st(0),%st(1) // z | local[0] | local[1] | local[2]
	fld		%st(0)		// z | z | local[0] | local[1] |
						//  local[2]
	fdivrs	float_1		// 1/z | z | local[0] | local[1] | local[2]
	fxch	%st(1)		// z | 1/z | local[0] | local[1] | local[2]

//	if (transformed[2] < PARTICLE_Z_CLIP)
//		return;
	fcomps	float_particle_z_clip	// 1/z | local[0] | local[1] | local[2]
// Capture the compare result immediately: Intel documents C0 as
// undefined after the flds/fmul instructions that follow, so reading the
// status word any later would rely on undocumented behavior.
	fnstsw	%ax
	fxch	%st(3)					// local[2] | local[0] | local[1] | 1/z

// The first steps of the r_pup dot product are issued before the branch,
// so the reject path has six x87 entries to discard.
	flds	C(r_pup)	// r_pup[0] | local[2] | local[0] | local[1] | 1/z
	fmul	%st(2),%st(0)	// dot0 | local[2] | local[0] | local[1] | 1/z
	flds	C(r_pup)+4	// r_pup[1] | dot0 | local[2] | local[0] |
						//  local[1] | 1/z

	testb	$1,%ah			// C0 (status-word bit 8) set: z < clip distance
	jnz		LPop6AndDone

//	transformed[1] = DotProduct(local, r_pup);
	fmul	%st(4),%st(0)	// dot1 | dot0 | local[2] | local[0] | local[1] | 1/z
	flds	C(r_pup)+8	// r_pup[2] | dot1 | dot0 | local[2] |
						//  local[0] | local[1] | 1/z
	fmul	%st(3),%st(0)	// dot2 | dot1 | dot0 | local[2] | local[0] |
						//  local[1] | 1/z
	fxch	%st(2)		// dot0 | dot1 | dot2 | local[2] | local[0] |
						//  local[1] | 1/z
	faddp	%st(0),%st(1) // dot0 + dot1 | dot2 | local[2] | local[0] |
						//  local[1] | 1/z
	faddp	%st(0),%st(1) // y | local[2] | local[0] | local[1] | 1/z
	fxch	%st(3)		// local[1] | local[2] | local[0] | y | 1/z

//	transformed[0] = DotProduct(local, r_pright);
	fmuls	C(r_pright)+4	// dot1 | local[2] | local[0] | y | 1/z
	fxch	%st(2)		// local[0] | local[2] | dot1 | y | 1/z
	fmuls	C(r_pright)	// dot0 | local[2] | dot1 | y | 1/z
	fxch	%st(1)		// local[2] | dot0 | dot1 | y | 1/z
	fmuls	C(r_pright)+8	// dot2 | dot0 | dot1 | y | 1/z
	fxch	%st(2)		// dot1 | dot0 | dot2 | y | 1/z
	faddp	%st(0),%st(1) // dot1 + dot0 | dot2 | y | 1/z

	faddp	%st(0),%st(1)	// x | y | 1/z
	fxch	%st(1)			// y | x | 1/z

// project the point
	fmul	%st(2),%st(0)	// y/z | x | 1/z
	fxch	%st(1)			// x | y/z | 1/z
	fmul	%st(2),%st(0)	// x/z | y/z | 1/z
	fxch	%st(1)			// y/z | x/z | 1/z
	fsubrs	C(ycenter)		// v = ycenter - y/z | x/z | 1/z
	fxch	%st(1)			// x/z | v | 1/z
	fadds	C(xcenter)		// u = xcenter + x/z | v | 1/z
// FIXME: preadjust xcenter and ycenter
	fxch	%st(1)			// v | u | 1/z
	fadds	float_point5	// v | u | 1/z
	fxch	%st(1)			// u | v | 1/z
	fadds	float_point5	// u | v | 1/z
	fxch	%st(2)			// 1/z | v | u
	fmuls	DP_32768		// 1/z * 0x8000 | v | u
	fxch	%st(2)			// u | v | 1/z * 0x8000

// FIXME: use Terje's fp->int trick here?
// FIXME: check we're getting proper rounding here
	fistpl	DP_u			// v | 1/z * 0x8000
	fistpl	DP_v			// 1/z * 0x8000

	movl	DP_u,%eax		// %eax = u
	movl	DP_v,%edx		// %edx = v

// if ((v > d_vrectbottom_particle) ||
// 	(u > d_vrectright_particle) ||
// 	(v < d_vrecty) ||
// 	(u < d_vrectx))
// {
// 	continue;
// }
// 1/z * 0x8000 is still on the x87 stack here, so the reject path pops one.

	movl	C(d_vrectbottom_particle),%ebx
	movl	C(d_vrectright_particle),%ecx
	cmpl	%ebx,%edx
	jg		LPop1AndDone
	cmpl	%ecx,%eax
	jg		LPop1AndDone
	movl	C(d_vrecty),%ebx
	movl	C(d_vrectx),%ecx
	cmpl	%ebx,%edx
	jl		LPop1AndDone

	cmpl	%ecx,%eax
	jl		LPop1AndDone

	flds	pt_color(%edi)	// color | 1/z * 0x8000
// FIXME: use Terje's fast fp->int trick?
	fistpl	DP_Color		// 1/z * 0x8000

// Build the two destination pointers. The particle pointer in %edi is no
// longer needed once pt_color has been read, so %edi is reused.
	movl	C(d_viewbuffer),%ebx

	addl	%eax,%ebx				// %ebx = d_viewbuffer + u
	movl	C(d_scantable)(,%edx,4),%edi		// point to the pixel

	imull	C(d_zrowbytes),%edx		// point to the z pixel

	leal	(%edx,%eax,2),%edx		// + u*2: z entries are 16 bits wide
	movl	C(d_pzbuffer),%eax

	fistpl	izi						// x87 stack is back to its entry depth

	addl	%ebx,%edi				// %edi = destination pixel
	addl	%eax,%edx				// %edx = z-buffer entry

// pix = izi >> d_pix_shift;

	movl	izi,%eax
	movl	C(d_pix_shift),%ecx
	shrl	%cl,%eax
	movl	izi,%ebp				// %bp is what the plot code tests and stores

// if (pix < d_pix_min)
// 		pix = d_pix_min;
// else if (pix > d_pix_max)
//  	pix = d_pix_max;

	movl	C(d_pix_min),%ebx
	movl	C(d_pix_max),%ecx
	cmpl	%ebx,%eax
	jnl		LTestPixMax
	movl	%ebx,%eax
	jmp		LTestDone

LTestPixMax:
	cmpl	%ecx,%eax
	jng		LTestDone
	movl	%ecx,%eax
LTestDone:

	movb	DP_Color,%ch			// %ch = color byte for the plot code

// The unrolled square plots are used only when d_y_aspect_shift is zero
// and pix is at most 4; everything else goes to the generic loop.
	movl	C(d_y_aspect_shift),%ebx
	testl	%ebx,%ebx
	jnz		LDefault

	cmpl	$4,%eax
	ja		LDefault

// Indirect jump through entry pix-1 of DP_EntryTable. The '*' is how
// AT&T syntax marks an indirect branch target; without it GAS only warns
// and guesses.
// NOTE(review): DP_EntryTable is defined outside this file; presumably it
// lists DP_1x1..DP_4x4 in order - confirm. The index also relies on
// pix >= 1 after clamping, i.e. on d_pix_min >= 1 - confirm where
// d_pix_min is set.
	jmp		*DP_EntryTable-4(,%eax,4)

// 1x1 particle: a single depth-tested pixel.
// Reached by jump, not call. On entry %edx points at the z-buffer entry,
// %edi at the destination pixel, %bp holds the depth value to test and
// store, and %ch holds the color byte.
.globl	DP_1x1
DP_1x1:
	cmpw	(%edx),%bp		// flags for izi - stored depth
	jl		LDone			// stored depth is greater: keep what is there
	movw	%bp,(%edx)		// pz[0] = izi
	movb	%ch,(%edi)		// pdest[0] = color
	jmp		LDone

// 2x2 particle: four depth-tested pixels, fully unrolled.
// Reached by jump, not call. On entry %edx points at the z-buffer entry
// and %edi at the destination pixel of the particle's starting corner;
// %bp is the depth value and %ch the color byte. %esi is saved here and
// restored before leaving; %ebx is overwritten (LDone restores the
// caller's copy).
// Each test writes the pixel only when the stored depth is <= %bp.
.globl	DP_2x2
DP_2x2:
	pushl	%esi
	movl	C(d_zrowbytes),%esi		// added to %edx to reach the next z row
	movl	C(screenwidth),%ebx		// added to %edi to reach the next pixel row

// row 0
	cmpw	(%edx),%bp
	jl		L2x2_r0c1
	movw	%bp,(%edx)
	movb	%ch,(%edi)
L2x2_r0c1:
	cmpw	2(%edx),%bp
	jl		L2x2_r1c0
	movw	%bp,2(%edx)
	movb	%ch,1(%edi)

// row 1
L2x2_r1c0:
	cmpw	(%edx,%esi,1),%bp
	jl		L2x2_r1c1
	movw	%bp,(%edx,%esi,1)
	movb	%ch,(%edi,%ebx,1)
L2x2_r1c1:
	cmpw	2(%edx,%esi,1),%bp
	jl		L2x2_Exit
	movw	%bp,2(%edx,%esi,1)
	movb	%ch,1(%edi,%ebx,1)

L2x2_Exit:
	popl	%esi
	jmp		LDone

// 3x3 particle: nine depth-tested pixels, fully unrolled.
// Reached by jump, not call. On entry %edx points at the z-buffer entry
// and %edi at the destination pixel of the particle's starting corner;
// %bp is the depth value and %ch the color byte. %esi is saved here and
// restored before leaving; %ebx is overwritten (LDone restores the
// caller's copy).
// Each test writes the pixel only when the stored depth is <= %bp.
// Row 2 is addressed with the row strides scaled by two.
.globl	DP_3x3
DP_3x3:
	pushl	%esi
	movl	C(d_zrowbytes),%esi		// added to %edx to reach the next z row
	movl	C(screenwidth),%ebx		// added to %edi to reach the next pixel row

// row 0
	cmpw	(%edx),%bp
	jl		L3x3_r0c1
	movw	%bp,(%edx)
	movb	%ch,(%edi)
L3x3_r0c1:
	cmpw	2(%edx),%bp
	jl		L3x3_r0c2
	movw	%bp,2(%edx)
	movb	%ch,1(%edi)
L3x3_r0c2:
	cmpw	4(%edx),%bp
	jl		L3x3_r1c0
	movw	%bp,4(%edx)
	movb	%ch,2(%edi)

// row 1
L3x3_r1c0:
	cmpw	(%edx,%esi,1),%bp
	jl		L3x3_r1c1
	movw	%bp,(%edx,%esi,1)
	movb	%ch,(%edi,%ebx,1)
L3x3_r1c1:
	cmpw	2(%edx,%esi,1),%bp
	jl		L3x3_r1c2
	movw	%bp,2(%edx,%esi,1)
	movb	%ch,1(%edi,%ebx,1)
L3x3_r1c2:
	cmpw	4(%edx,%esi,1),%bp
	jl		L3x3_r2c0
	movw	%bp,4(%edx,%esi,1)
	movb	%ch,2(%edi,%ebx,1)

// row 2
L3x3_r2c0:
	cmpw	(%edx,%esi,2),%bp
	jl		L3x3_r2c1
	movw	%bp,(%edx,%esi,2)
	movb	%ch,(%edi,%ebx,2)
L3x3_r2c1:
	cmpw	2(%edx,%esi,2),%bp
	jl		L3x3_r2c2
	movw	%bp,2(%edx,%esi,2)
	movb	%ch,1(%edi,%ebx,2)
L3x3_r2c2:
	cmpw	4(%edx,%esi,2),%bp
	jl		L3x3_Exit
	movw	%bp,4(%edx,%esi,2)
	movb	%ch,2(%edi,%ebx,2)

L3x3_Exit:
	popl	%esi
	jmp		LDone


// 4x4 particle: sixteen depth-tested pixels, fully unrolled.
// Reached by jump, not call. On entry %edx points at the z-buffer entry
// and %edi at the destination pixel of the particle's starting corner;
// %bp is the depth value and %ch the color byte. %esi is saved here and
// restored before leaving; %ebx is overwritten (LDone restores the
// caller's copy).
// Each test writes the pixel only when the stored depth is <= %bp.
// Rows 0 and 1 are plotted first, then both pointers are moved down two
// rows and the same addressing is used again for rows 2 and 3.
.globl	DP_4x4
DP_4x4:
	pushl	%esi
	movl	C(d_zrowbytes),%esi		// added to %edx to reach the next z row
	movl	C(screenwidth),%ebx		// added to %edi to reach the next pixel row

// row 0
	cmpw	(%edx),%bp
	jl		L4x4_r0c1
	movw	%bp,(%edx)
	movb	%ch,(%edi)
L4x4_r0c1:
	cmpw	2(%edx),%bp
	jl		L4x4_r0c2
	movw	%bp,2(%edx)
	movb	%ch,1(%edi)
L4x4_r0c2:
	cmpw	4(%edx),%bp
	jl		L4x4_r0c3
	movw	%bp,4(%edx)
	movb	%ch,2(%edi)
L4x4_r0c3:
	cmpw	6(%edx),%bp
	jl		L4x4_r1c0
	movw	%bp,6(%edx)
	movb	%ch,3(%edi)

// row 1
L4x4_r1c0:
	cmpw	(%edx,%esi,1),%bp
	jl		L4x4_r1c1
	movw	%bp,(%edx,%esi,1)
	movb	%ch,(%edi,%ebx,1)
L4x4_r1c1:
	cmpw	2(%edx,%esi,1),%bp
	jl		L4x4_r1c2
	movw	%bp,2(%edx,%esi,1)
	movb	%ch,1(%edi,%ebx,1)
L4x4_r1c2:
	cmpw	4(%edx,%esi,1),%bp
	jl		L4x4_r1c3
	movw	%bp,4(%edx,%esi,1)
	movb	%ch,2(%edi,%ebx,1)
L4x4_r1c3:
	cmpw	6(%edx,%esi,1),%bp
	jl		L4x4_LowerHalf
	movw	%bp,6(%edx,%esi,1)
	movb	%ch,3(%edi,%ebx,1)

// step both pointers down two rows
L4x4_LowerHalf:
	leal	(%edi,%ebx,2),%edi
	leal	(%edx,%esi,2),%edx

// row 2
	cmpw	(%edx),%bp
	jl		L4x4_r2c1
	movw	%bp,(%edx)
	movb	%ch,(%edi)
L4x4_r2c1:
	cmpw	2(%edx),%bp
	jl		L4x4_r2c2
	movw	%bp,2(%edx)
	movb	%ch,1(%edi)
L4x4_r2c2:
	cmpw	4(%edx),%bp
	jl		L4x4_r2c3
	movw	%bp,4(%edx)
	movb	%ch,2(%edi)
L4x4_r2c3:
	cmpw	6(%edx),%bp
	jl		L4x4_r3c0
	movw	%bp,6(%edx)
	movb	%ch,3(%edi)

// row 3
L4x4_r3c0:
	cmpw	(%edx,%esi,1),%bp
	jl		L4x4_r3c1
	movw	%bp,(%edx,%esi,1)
	movb	%ch,(%edi,%ebx,1)
L4x4_r3c1:
	cmpw	2(%edx,%esi,1),%bp
	jl		L4x4_r3c2
	movw	%bp,2(%edx,%esi,1)
	movb	%ch,1(%edi,%ebx,1)
L4x4_r3c2:
	cmpw	4(%edx,%esi,1),%bp
	jl		L4x4_r3c3
	movw	%bp,4(%edx,%esi,1)
	movb	%ch,2(%edi,%ebx,1)
L4x4_r3c3:
	cmpw	6(%edx,%esi,1),%bp
	jl		L4x4_Exit
	movw	%bp,6(%edx,%esi,1)
	movb	%ch,3(%edi,%ebx,1)

L4x4_Exit:
	popl	%esi
	jmp		LDone

// Generic plot for a particle of any size, followed by the exits shared
// by the whole routine.
// Reached by jump with %eax = pix, %edx -> z-buffer entry, %edi ->
// destination pixel, %bp = depth value, %ch = color byte.
LDefault:

// count = pix << d_y_aspect_shift;

	movl	%eax,DP_Pix			// kept in memory: %eax is the column counter
	movl	%eax,%ebx
	movb	C(d_y_aspect_shift),%cl	// only %cl is written; %ch keeps the color
	shll	%cl,%ebx			// %ebx = count (rows to plot)

// for ( ; count ; count--, pz += d_zwidth, pdest += screenwidth)
// {
// 	for (i=0 ; i<pix ; i++)
// 	{
// 		if (pz[i] <= izi)
// 		{
// 			pz[i] = izi;
// 			pdest[i] = color;
// 		}
// 	}
// }
//
// Both loops decrement and then test, so each body runs at least once.
// Columns are visited from pix-1 down to 0, which is why the operands
// carry a displacement of one element below the index.

LGenNextRow:
	movl	DP_Pix,%eax

LGenNextCol:
	cmpw	-2(%edx,%eax,2),%bp	// flags for izi - pz[i]
	jl		LGenKeep			// pz[i] > izi: leave this pixel alone
	movw	%bp,-2(%edx,%eax,2)
	movb	%ch,-1(%edi,%eax,1)
LGenKeep:
	decl	%eax			// next column down
	jnz		LGenNextCol

	addl	C(screenwidth),%edi	// pdest: next pixel row
	addl	C(d_zrowbytes),%edx	// pz: next z-buffer row

	decl	%ebx			// --count
	jnz		LGenNextRow

// Common exit: undo the three pushes made on entry to D_DrawParticle.
LDone:
	popl	%ebx				// restore register variables
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

// Reject exits. They discard the values still on the x87 stack (six for
// the z-clip reject, one for the screen-rectangle reject) and then leave
// through LDone.
LPop6AndDone:
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)
LPop1AndDone:
	fstp	%st(0)
	jmp		LDone

477#endif	// id386
478