1/*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28/**
29 * @file
30 *
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
33 *
34 */
35
36#include "radeon_program_alu.h"
37
38#include "radeon_compiler.h"
39#include "radeon_compiler_util.h"
40
41
42static struct rc_instruction *emit1(
43	struct radeon_compiler * c, struct rc_instruction * after,
44	rc_opcode Opcode, struct rc_sub_instruction * base,
45	struct rc_dst_register DstReg, struct rc_src_register SrcReg)
46{
47	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
49	if (base) {
50		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
51	}
52
53	fpi->U.I.Opcode = Opcode;
54	fpi->U.I.DstReg = DstReg;
55	fpi->U.I.SrcReg[0] = SrcReg;
56	return fpi;
57}
58
59static struct rc_instruction *emit2(
60	struct radeon_compiler * c, struct rc_instruction * after,
61	rc_opcode Opcode, struct rc_sub_instruction * base,
62	struct rc_dst_register DstReg,
63	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
64{
65	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
66
67	if (base) {
68		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
69	}
70
71	fpi->U.I.Opcode = Opcode;
72	fpi->U.I.DstReg = DstReg;
73	fpi->U.I.SrcReg[0] = SrcReg0;
74	fpi->U.I.SrcReg[1] = SrcReg1;
75	return fpi;
76}
77
78static struct rc_instruction *emit3(
79	struct radeon_compiler * c, struct rc_instruction * after,
80	rc_opcode Opcode, struct rc_sub_instruction * base,
81	struct rc_dst_register DstReg,
82	struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
83	struct rc_src_register SrcReg2)
84{
85	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
86
87	if (base) {
88		memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
89	}
90
91	fpi->U.I.Opcode = Opcode;
92	fpi->U.I.DstReg = DstReg;
93	fpi->U.I.SrcReg[0] = SrcReg0;
94	fpi->U.I.SrcReg[1] = SrcReg1;
95	fpi->U.I.SrcReg[2] = SrcReg2;
96	return fpi;
97}
98
99static struct rc_dst_register dstregtmpmask(int index, int mask)
100{
101	struct rc_dst_register dst = {0, 0, 0};
102	dst.File = RC_FILE_TEMPORARY;
103	dst.Index = index;
104	dst.WriteMask = mask;
105	return dst;
106}
107
108static const struct rc_src_register builtin_zero = {
109	.File = RC_FILE_NONE,
110	.Index = 0,
111	.Swizzle = RC_SWIZZLE_0000
112};
113static const struct rc_src_register builtin_one = {
114	.File = RC_FILE_NONE,
115	.Index = 0,
116	.Swizzle = RC_SWIZZLE_1111
117};
118
119static const struct rc_src_register builtin_half = {
120	.File = RC_FILE_NONE,
121	.Index = 0,
122	.Swizzle = RC_SWIZZLE_HHHH
123};
124
125static const struct rc_src_register srcreg_undefined = {
126	.File = RC_FILE_NONE,
127	.Index = 0,
128	.Swizzle = RC_SWIZZLE_XYZW
129};
130
131static struct rc_src_register srcreg(int file, int index)
132{
133	struct rc_src_register src = srcreg_undefined;
134	src.File = file;
135	src.Index = index;
136	return src;
137}
138
139static struct rc_src_register srcregswz(int file, int index, int swz)
140{
141	struct rc_src_register src = srcreg_undefined;
142	src.File = file;
143	src.Index = index;
144	src.Swizzle = swz;
145	return src;
146}
147
148static struct rc_src_register absolute(struct rc_src_register reg)
149{
150	struct rc_src_register newreg = reg;
151	newreg.Abs = 1;
152	newreg.Negate = RC_MASK_NONE;
153	return newreg;
154}
155
156static struct rc_src_register negate(struct rc_src_register reg)
157{
158	struct rc_src_register newreg = reg;
159	newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160	return newreg;
161}
162
163static struct rc_src_register swizzle(struct rc_src_register reg,
164		rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
165{
166	struct rc_src_register swizzled = reg;
167	swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
168	return swizzled;
169}
170
171static struct rc_src_register swizzle_smear(struct rc_src_register reg,
172		rc_swizzle x)
173{
174	return swizzle(reg, x, x, x, x);
175}
176
177static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178{
179	return swizzle_smear(reg, RC_SWIZZLE_X);
180}
181
182static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183{
184	return swizzle_smear(reg, RC_SWIZZLE_Y);
185}
186
187static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188{
189	return swizzle_smear(reg, RC_SWIZZLE_Z);
190}
191
192static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193{
194	return swizzle_smear(reg, RC_SWIZZLE_W);
195}
196
197static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198{
199	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200	unsigned i;
201
202	assert(info->HasDstReg);
203
204	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205		return 0;
206
207	for (i = 0; i < info->NumSrcRegs; i++) {
208		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210			return 0;
211	}
212
213	return 1;
214}
215
216static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217					       struct rc_instruction *inst)
218{
219	unsigned tmp;
220
221	if (is_dst_safe_to_reuse(inst))
222		tmp = inst->U.I.DstReg.Index;
223	else
224		tmp = rc_find_free_temporary(c);
225
226	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227}
228
229static void transform_ABS(struct radeon_compiler* c,
230	struct rc_instruction* inst)
231{
232	struct rc_src_register src = inst->U.I.SrcReg[0];
233	src.Abs = 1;
234	src.Negate = RC_MASK_NONE;
235	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236	rc_remove_instruction(inst);
237}
238
239static void transform_CEIL(struct radeon_compiler* c,
240	struct rc_instruction* inst)
241{
242	/* Assuming:
243	 *     ceil(x) = -floor(-x)
244	 *
245	 * After inlining floor:
246	 *     ceil(x) = -(-x-frac(-x))
247	 *
248	 * After simplification:
249	 *     ceil(x) = x+frac(-x)
250	 */
251
252	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256	rc_remove_instruction(inst);
257}
258
259static void transform_CLAMP(struct radeon_compiler *c,
260	struct rc_instruction *inst)
261{
262	/* CLAMP dst, src, min, max
263	 *    into:
264	 * MIN tmp, src, max
265	 * MAX dst, tmp, min
266	 */
267	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270	emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272	rc_remove_instruction(inst);
273}
274
275static void transform_DP2(struct radeon_compiler* c,
276	struct rc_instruction* inst)
277{
278	struct rc_src_register src0 = inst->U.I.SrcReg[0];
279	struct rc_src_register src1 = inst->U.I.SrcReg[1];
280	src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281	src0.Swizzle &= ~(63 << (3 * 2));
282	src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284	src1.Swizzle &= ~(63 << (3 * 2));
285	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286	emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287	rc_remove_instruction(inst);
288}
289
290static void transform_DPH(struct radeon_compiler* c,
291	struct rc_instruction* inst)
292{
293	struct rc_src_register src0 = inst->U.I.SrcReg[0];
294	src0.Negate &= ~RC_MASK_W;
295	src0.Swizzle &= ~(7 << (3 * 3));
296	src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298	rc_remove_instruction(inst);
299}
300
301/**
302 * [1, src0.y*src1.y, src0.z, src1.w]
303 * So basically MUL with lotsa swizzling.
304 */
305static void transform_DST(struct radeon_compiler* c,
306	struct rc_instruction* inst)
307{
308	emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311	rc_remove_instruction(inst);
312}
313
314static void transform_FLR(struct radeon_compiler* c,
315	struct rc_instruction* inst)
316{
317	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319	emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321	rc_remove_instruction(inst);
322}
323
324static void transform_TRUNC(struct radeon_compiler* c,
325	struct rc_instruction* inst)
326{
327	/* Definition of trunc:
328	 *   trunc(x) = (abs(x) - fract(abs(x))) * sgn(x)
329	 *
330	 * The multiplication by sgn(x) can be simplified using CMP:
331	 *   y * sgn(x) = (x < 0 ? -y : y)
332	 */
333	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
334	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, absolute(inst->U.I.SrcReg[0]));
335	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, absolute(inst->U.I.SrcReg[0]),
336	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
337	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg, inst->U.I.SrcReg[0],
338	      negate(srcreg(RC_FILE_TEMPORARY, dst.Index)), srcreg(RC_FILE_TEMPORARY, dst.Index));
339	rc_remove_instruction(inst);
340}
341
/**
 * Definition of LIT (from ARB_fragment_program):
 *
 *  tmp = VectorLoad(op0);
 *  if (tmp.x < 0) tmp.x = 0;
 *  if (tmp.y < 0) tmp.y = 0;
 *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
 *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
 *  result.x = 1.0;
 *  result.y = tmp.x;
 *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
 *  result.w = 1.0;
 *
 * The longest path of computation is the one leading to result.z,
 * consisting of 5 operations. This implementation of LIT takes
 * 5 slots, if the subsequent optimization passes are clever enough
 * to pair instructions correctly.
 *
 * All intermediate values are kept in the channels of one temporary
 * register, which is also the register the final result is written to.
 */
static void transform_LIT(struct radeon_compiler* c,
	struct rc_instruction* inst)
{
	unsigned int constant;
	unsigned int constant_swizzle;
	unsigned int temp;
	struct rc_src_register srctemp;

	/* Lower clamp bound for the exponent; the upper bound is obtained
	 * below by negating this same constant. */
	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);

	/* The expansion needs a temporary it may write in full.  If the
	 * destination is not one, compute into a free temporary and append a
	 * MOV (inserted after inst) that copies it to the real destination. */
	if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
		struct rc_instruction * inst_mov;

		inst_mov = emit1(c, inst,
			RC_OPCODE_MOV, 0, inst->U.I.DstReg,
			srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));

		inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
		inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
		inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
	}

	temp = inst->U.I.DstReg.Index;
	srctemp = srcreg(RC_FILE_TEMPORARY, temp);

	/* tmp.x = max(Src.x, 0.0); */
	/* tmp.y = max(Src.y, 0.0); */
	/* tmp.w = max(Src.w, -128+eps);   (lower half of the clamp) */
	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
		dstregtmpmask(temp, RC_MASK_XYW),
		inst->U.I.SrcReg[0],
		swizzle(srcreg(RC_FILE_CONSTANT, constant),
			RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
	/* tmp.z = min(tmp.w, 128-eps);    (upper half of the clamp) */
	emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
		dstregtmpmask(temp, RC_MASK_Z),
		swizzle_wwww(srctemp),
		negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));

	/* tmp.w = Pow(tmp.y, tmp.z), computed as exp2(log2(tmp.y) * tmp.z) */
	emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_yyyy(srctemp));
	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_wwww(srctemp),
		swizzle_zzzz(srctemp));
	emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
		dstregtmpmask(temp, RC_MASK_W),
		swizzle_wwww(srctemp));

	/* tmp.z = (tmp.x > 0) ? tmp.w : 0.0   (CMP tests -tmp.x < 0) */
	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
		dstregtmpmask(temp, RC_MASK_Z),
		negate(swizzle_xxxx(srctemp)),
		swizzle_wwww(srctemp),
		builtin_zero);

	/* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
		dstregtmpmask(temp, RC_MASK_XYW),
		swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));

	rc_remove_instruction(inst);
}
424
425static void transform_LRP(struct radeon_compiler* c,
426	struct rc_instruction* inst)
427{
428	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429
430	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
431		dst,
432		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
433	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
434		inst->U.I.DstReg,
435		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
436
437	rc_remove_instruction(inst);
438}
439
440static void transform_POW(struct radeon_compiler* c,
441	struct rc_instruction* inst)
442{
443	struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
444	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
445	tempdst.WriteMask = RC_MASK_W;
446	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
447
448	emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
449	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
450	emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
451
452	rc_remove_instruction(inst);
453}
454
455/* dst = ROUND(src) :
456 *   add = src + .5
457 *   frac = FRC(add)
458 *   dst = add - frac
459 *
460 * According to the GLSL spec, the implementor can decide which way to round
461 * when the fraction is .5.  We round down for .5.
462 *
463 */
464static void transform_ROUND(struct radeon_compiler* c,
465	struct rc_instruction* inst)
466{
467	unsigned int mask = inst->U.I.DstReg.WriteMask;
468	unsigned int frac_index, add_index;
469	struct rc_dst_register frac_dst, add_dst;
470	struct rc_src_register frac_src, add_src;
471
472	/* add = src + .5 */
473	add_index = rc_find_free_temporary(c);
474	add_dst = dstregtmpmask(add_index, mask);
475	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
476								builtin_half);
477	add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
478
479
480	/* frac = FRC(add) */
481	frac_index = rc_find_free_temporary(c);
482	frac_dst = dstregtmpmask(frac_index, mask);
483	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
484	frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
485
486	/* dst = add - frac */
487	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
488						add_src, negate(frac_src));
489	rc_remove_instruction(inst);
490}
491
492static void transform_RSQ(struct radeon_compiler* c,
493	struct rc_instruction* inst)
494{
495	inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
496}
497
498static void transform_SEQ(struct radeon_compiler* c,
499	struct rc_instruction* inst)
500{
501	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
502
503	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
504	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
505		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
506
507	rc_remove_instruction(inst);
508}
509
510static void transform_SFL(struct radeon_compiler* c,
511	struct rc_instruction* inst)
512{
513	emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
514	rc_remove_instruction(inst);
515}
516
517static void transform_SGE(struct radeon_compiler* c,
518	struct rc_instruction* inst)
519{
520	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
521
522	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
523	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
524		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
525
526	rc_remove_instruction(inst);
527}
528
529static void transform_SGT(struct radeon_compiler* c,
530	struct rc_instruction* inst)
531{
532	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
533
534	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
535	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
536		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
537
538	rc_remove_instruction(inst);
539}
540
541static void transform_SLE(struct radeon_compiler* c,
542	struct rc_instruction* inst)
543{
544	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
545
546	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
547	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
548		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
549
550	rc_remove_instruction(inst);
551}
552
553static void transform_SLT(struct radeon_compiler* c,
554	struct rc_instruction* inst)
555{
556	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
557
558	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
559	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
560		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
561
562	rc_remove_instruction(inst);
563}
564
565static void transform_SNE(struct radeon_compiler* c,
566	struct rc_instruction* inst)
567{
568	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
569
570	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
571	emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
572		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
573
574	rc_remove_instruction(inst);
575}
576
577static void transform_SSG(struct radeon_compiler* c,
578	struct rc_instruction* inst)
579{
580	/* result = sign(x)
581	 *
582	 *   CMP tmp0, -x, 1, 0
583	 *   CMP tmp1, x, 1, 0
584	 *   ADD result, tmp0, -tmp1;
585	 */
586	struct rc_dst_register dst0;
587	unsigned tmp1;
588
589	/* 0 < x */
590	dst0 = try_to_reuse_dst(c, inst);
591	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
592	      dst0,
593	      negate(inst->U.I.SrcReg[0]),
594	      builtin_one,
595	      builtin_zero);
596
597	/* x < 0 */
598	tmp1 = rc_find_free_temporary(c);
599	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
600	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
601	      inst->U.I.SrcReg[0],
602	      builtin_one,
603	      builtin_zero);
604
605	/* Either both are zero, or one of them is one and the other is zero. */
606	/* result = tmp0 - tmp1 */
607	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
608	      inst->U.I.DstReg,
609	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
610	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
611
612	rc_remove_instruction(inst);
613}
614
615static void transform_SUB(struct radeon_compiler* c,
616	struct rc_instruction* inst)
617{
618	inst->U.I.Opcode = RC_OPCODE_ADD;
619	inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
620}
621
622static void transform_SWZ(struct radeon_compiler* c,
623	struct rc_instruction* inst)
624{
625	inst->U.I.Opcode = RC_OPCODE_MOV;
626}
627
628static void transform_XPD(struct radeon_compiler* c,
629	struct rc_instruction* inst)
630{
631	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
632
633	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
634		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
635		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
636	emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
637		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
638		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
639		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
640
641	rc_remove_instruction(inst);
642}
643
644
645/**
646 * Can be used as a transformation for @ref radeonClauseLocalTransform,
647 * no userData necessary.
648 *
649 * Eliminates the following ALU instructions:
650 *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
651 * using:
652 *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
653 *
654 * Transforms RSQ to Radeon's native RSQ by explicitly setting
655 * absolute value.
656 *
657 * @note should be applicable to R300 and R500 fragment programs.
658 */
659int radeonTransformALU(
660	struct radeon_compiler * c,
661	struct rc_instruction* inst,
662	void* unused)
663{
664	switch(inst->U.I.Opcode) {
665	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
666	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
667	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
668	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
669	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
670	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
671	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
672	case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
673	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
674	case RC_OPCODE_POW: transform_POW(c, inst); return 1;
675	case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
676	case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
677	case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
678	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
679	case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
680	case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
681	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
682	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
683	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
684	case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
685	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
686	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
687	case RC_OPCODE_TRUNC: transform_TRUNC(c, inst); return 1;
688	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
689	default:
690		return 0;
691	}
692}
693
694
695static void transform_r300_vertex_ABS(struct radeon_compiler* c,
696	struct rc_instruction* inst)
697{
698	/* Note: r500 can take absolute values, but r300 cannot. */
699	inst->U.I.Opcode = RC_OPCODE_MAX;
700	inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
701	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
702}
703
704static void transform_r300_vertex_CMP(struct radeon_compiler* c,
705	struct rc_instruction* inst)
706{
707	/* There is no decent CMP available, so let's rig one up.
708	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
709	 * The following sequence consumes zero to two temps and two extra slots
710	 * (the second temp and the second slot is consumed by transform_LRP),
711	 * but should be equivalent:
712	 *
713	 * SLT tmp0, src0, 0.0
714	 * LRP dst, tmp0, src1, src2
715	 *
716	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
717	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
718
719	/* SLT tmp0, src0, 0.0 */
720	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
721		dst,
722		inst->U.I.SrcReg[0], builtin_zero);
723
724	/* LRP dst, tmp0, src1, src2 */
725	transform_LRP(c,
726		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
727		      inst->U.I.DstReg,
728		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
729
730	rc_remove_instruction(inst);
731}
732
733static void transform_r300_vertex_DP2(struct radeon_compiler* c,
734	struct rc_instruction* inst)
735{
736	struct rc_instruction *next_inst = inst->Next;
737	transform_DP2(c, inst);
738	next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
739}
740
741static void transform_r300_vertex_DP3(struct radeon_compiler* c,
742	struct rc_instruction* inst)
743{
744	struct rc_src_register src0 = inst->U.I.SrcReg[0];
745	struct rc_src_register src1 = inst->U.I.SrcReg[1];
746	src0.Negate &= ~RC_MASK_W;
747	src0.Swizzle &= ~(7 << (3 * 3));
748	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
749	src1.Negate &= ~RC_MASK_W;
750	src1.Swizzle &= ~(7 << (3 * 3));
751	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
752	emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
753	rc_remove_instruction(inst);
754}
755
756static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
757	struct rc_instruction* inst)
758{
759	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
760	unsigned constant_swizzle;
761	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
762							 0.0000000000000000001,
763							 &constant_swizzle);
764
765	/* MOV dst, src */
766	dst.WriteMask = RC_MASK_XYZW;
767	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
768		dst,
769		inst->U.I.SrcReg[0]);
770
771	/* MAX dst.y, src, 0.00...001 */
772	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
773		dstregtmpmask(dst.Index, RC_MASK_Y),
774		srcreg(RC_FILE_TEMPORARY, dst.Index),
775		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
776
777	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
778}
779
780static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
781	struct rc_instruction *inst)
782{
783	/* x = y  <==>  x >= y && y >= x */
784	int tmp = rc_find_free_temporary(c);
785
786	/* x <= y */
787	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
788	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
789	      inst->U.I.SrcReg[0],
790	      inst->U.I.SrcReg[1]);
791
792	/* y <= x */
793	emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
794	      inst->U.I.DstReg,
795	      inst->U.I.SrcReg[1],
796	      inst->U.I.SrcReg[0]);
797
798	/* x && y  =  x * y */
799	emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
800	      inst->U.I.DstReg,
801	      srcreg(RC_FILE_TEMPORARY, tmp),
802	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
803
804	rc_remove_instruction(inst);
805}
806
807static void transform_r300_vertex_SNE(struct radeon_compiler *c,
808	struct rc_instruction *inst)
809{
810	/* x != y  <==>  x < y || y < x */
811	int tmp = rc_find_free_temporary(c);
812
813	/* x < y */
814	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
815	      dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
816	      inst->U.I.SrcReg[0],
817	      inst->U.I.SrcReg[1]);
818
819	/* y < x */
820	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
821	      inst->U.I.DstReg,
822	      inst->U.I.SrcReg[1],
823	      inst->U.I.SrcReg[0]);
824
825	/* x || y  =  max(x, y) */
826	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
827	      inst->U.I.DstReg,
828	      srcreg(RC_FILE_TEMPORARY, tmp),
829	      srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
830
831	rc_remove_instruction(inst);
832}
833
834static void transform_r300_vertex_SGT(struct radeon_compiler* c,
835	struct rc_instruction* inst)
836{
837	/* x > y  <==>  -x < -y */
838	inst->U.I.Opcode = RC_OPCODE_SLT;
839	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
840	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
841}
842
843static void transform_r300_vertex_SLE(struct radeon_compiler* c,
844	struct rc_instruction* inst)
845{
846	/* x <= y  <==>  -x >= -y */
847	inst->U.I.Opcode = RC_OPCODE_SGE;
848	inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
849	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
850}
851
852static void transform_r300_vertex_SSG(struct radeon_compiler* c,
853	struct rc_instruction* inst)
854{
855	/* result = sign(x)
856	 *
857	 *   SLT tmp0, 0, x;
858	 *   SLT tmp1, x, 0;
859	 *   ADD result, tmp0, -tmp1;
860	 */
861	struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
862	unsigned tmp1;
863
864	/* 0 < x */
865	dst0 = try_to_reuse_dst(c, inst);
866	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
867	      dst0,
868	      builtin_zero,
869	      inst->U.I.SrcReg[0]);
870
871	/* x < 0 */
872	tmp1 = rc_find_free_temporary(c);
873	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
874	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
875	      inst->U.I.SrcReg[0],
876	      builtin_zero);
877
878	/* Either both are zero, or one of them is one and the other is zero. */
879	/* result = tmp0 - tmp1 */
880	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
881	      inst->U.I.DstReg,
882	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
883	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
884
885	rc_remove_instruction(inst);
886}
887
888static void transform_vertex_TRUNC(struct radeon_compiler* c,
889	struct rc_instruction* inst)
890{
891	struct rc_instruction *next = inst->Next;
892
893	/* next->Prev is removed after each transformation and replaced
894	 * by a new instruction. */
895	transform_TRUNC(c, next->Prev);
896	transform_r300_vertex_CMP(c, next->Prev);
897}
898
899/**
900 * For use with rc_local_transform, this transforms non-native ALU
901 * instructions of the r300 up to r500 vertex engine.
902 */
903int r300_transform_vertex_alu(
904	struct radeon_compiler * c,
905	struct rc_instruction* inst,
906	void* unused)
907{
908	switch(inst->U.I.Opcode) {
909	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
910	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
911	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
912	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
913	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
914	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
915	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
916	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
917	case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
918	case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
919	case RC_OPCODE_SEQ:
920		if (!c->is_r500) {
921			transform_r300_vertex_SEQ(c, inst);
922			return 1;
923		}
924		return 0;
925	case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
926	case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
927	case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
928	case RC_OPCODE_SNE:
929		if (!c->is_r500) {
930			transform_r300_vertex_SNE(c, inst);
931			return 1;
932		}
933		return 0;
934	case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
935	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
936	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
937	case RC_OPCODE_TRUNC: transform_vertex_TRUNC(c, inst); return 1;
938	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
939	default:
940		return 0;
941	}
942}
943
944static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
945{
946	static const float SinCosConsts[2][4] = {
947		{
948			1.273239545,		/* 4/PI */
949			-0.405284735,		/* -4/(PI*PI) */
950			3.141592654,		/* PI */
951			0.2225			/* weight */
952		},
953		{
954			0.75,
955			0.5,
956			0.159154943,		/* 1/(2*PI) */
957			6.283185307		/* 2*PI */
958		}
959	};
960	int i;
961
962	for(i = 0; i < 2; ++i)
963		constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
964}
965
966/**
967 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
968 *
969 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
970 * MAD tmp.x, tmp.y, |src|, tmp.x
971 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
972 * MAD dest, tmp.y, weight, tmp.x
973 */
974static void sin_approx(
975	struct radeon_compiler* c, struct rc_instruction * inst,
976	struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
977{
978	unsigned int tempreg = rc_find_free_temporary(c);
979
980	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
981		swizzle_xxxx(src),
982		srcreg(RC_FILE_CONSTANT, constants[0]));
983	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
984		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
985		absolute(swizzle_xxxx(src)),
986		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
987	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
988		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
989		absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
990		negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
991	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
992		swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
993		swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
994		swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
995}
996
997/**
998 * Translate the trigonometric functions COS, SIN, and SCS
999 * using only the basic instructions
1000 *  MOV, ADD, MUL, MAD, FRC
1001 */
1002int r300_transform_trig_simple(struct radeon_compiler* c,
1003	struct rc_instruction* inst,
1004	void* unused)
1005{
1006	unsigned int constants[2];
1007	unsigned int tempreg;
1008
1009	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1010	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1011	    inst->U.I.Opcode != RC_OPCODE_SCS)
1012		return 0;
1013
1014	tempreg = rc_find_free_temporary(c);
1015
1016	sincos_constants(c, constants);
1017
1018	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1019		/* MAD tmp.x, src, 1/(2*PI), 0.75 */
1020		/* FRC tmp.x, tmp.x */
1021		/* MAD tmp.z, tmp.x, 2*PI, -PI */
1022		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1023			swizzle_xxxx(inst->U.I.SrcReg[0]),
1024			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1025			swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
1026		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1027			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1028		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1029			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1030			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1031			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1032
1033		sin_approx(c, inst, inst->U.I.DstReg,
1034			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1035			constants);
1036	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1037		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1038			swizzle_xxxx(inst->U.I.SrcReg[0]),
1039			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1040			swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1041		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1042			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1043		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1044			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1045			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1046			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1047
1048		sin_approx(c, inst, inst->U.I.DstReg,
1049			swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1050			constants);
1051	} else {
1052		struct rc_dst_register dst;
1053
1054		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1055			swizzle_xxxx(inst->U.I.SrcReg[0]),
1056			swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1057			swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1058		emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1059			srcreg(RC_FILE_TEMPORARY, tempreg));
1060		emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1061			srcreg(RC_FILE_TEMPORARY, tempreg),
1062			swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1063			negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1064
1065		dst = inst->U.I.DstReg;
1066
1067		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1068		sin_approx(c, inst, dst,
1069			swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1070			constants);
1071
1072		dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1073		sin_approx(c, inst, dst,
1074			swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1075			constants);
1076	}
1077
1078	rc_remove_instruction(inst);
1079
1080	return 1;
1081}
1082
1083static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1084	struct rc_instruction *inst,
1085	unsigned srctmp)
1086{
1087	if (inst->U.I.Opcode == RC_OPCODE_COS) {
1088		emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1089			srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1090	} else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1091		emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1092			inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1093	} else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1094		struct rc_dst_register moddst = inst->U.I.DstReg;
1095
1096		if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1097			moddst.WriteMask = RC_MASK_X;
1098			emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1099				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1100		}
1101		if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1102			moddst.WriteMask = RC_MASK_Y;
1103			emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1104				srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1105		}
1106	}
1107
1108	rc_remove_instruction(inst);
1109}
1110
1111
1112/**
1113 * Transform the trigonometric functions COS, SIN, and SCS
1114 * to include pre-scaling by 1/(2*PI) and taking the fractional
1115 * part, so that the input to COS and SIN is always in the range [0,1).
1116 * SCS is replaced by one COS and one SIN instruction.
1117 *
1118 * @warning This transformation implicitly changes the semantics of SIN and COS!
1119 */
1120int radeonTransformTrigScale(struct radeon_compiler* c,
1121	struct rc_instruction* inst,
1122	void* unused)
1123{
1124	static const float RCP_2PI = 0.15915494309189535;
1125	unsigned int temp;
1126	unsigned int constant;
1127	unsigned int constant_swizzle;
1128
1129	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1131	    inst->U.I.Opcode != RC_OPCODE_SCS)
1132		return 0;
1133
1134	temp = rc_find_free_temporary(c);
1135	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1136
1137	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1138		swizzle_xxxx(inst->U.I.SrcReg[0]),
1139		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1140	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1141		srcreg(RC_FILE_TEMPORARY, temp));
1142
1143	r300_transform_SIN_COS_SCS(c, inst, temp);
1144	return 1;
1145}
1146
1147/**
1148 * Transform the trigonometric functions COS, SIN, and SCS
1149 * so that the input to COS and SIN is always in the range [-PI, PI].
1150 * SCS is replaced by one COS and one SIN instruction.
1151 */
1152int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1153	struct rc_instruction *inst,
1154	void *unused)
1155{
1156	static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1157	unsigned int temp;
1158	unsigned int constant;
1159
1160	if (inst->U.I.Opcode != RC_OPCODE_COS &&
1161	    inst->U.I.Opcode != RC_OPCODE_SIN &&
1162	    inst->U.I.Opcode != RC_OPCODE_SCS)
1163		return 0;
1164
1165	/* Repeat x in the range [-PI, PI]:
1166	 *
1167	 *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1168	 */
1169
1170	temp = rc_find_free_temporary(c);
1171	constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1172
1173	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1174		swizzle_xxxx(inst->U.I.SrcReg[0]),
1175		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1176		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1177	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1178		srcreg(RC_FILE_TEMPORARY, temp));
1179	emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1180		srcreg(RC_FILE_TEMPORARY, temp),
1181		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1182		srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1183
1184	r300_transform_SIN_COS_SCS(c, inst, temp);
1185	return 1;
1186}
1187
1188/**
1189 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1190 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1191 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1192 *
1193 * @warning This explicitly changes the form of DDX and DDY!
1194 */
1195
1196int radeonTransformDeriv(struct radeon_compiler* c,
1197	struct rc_instruction* inst,
1198	void* unused)
1199{
1200	if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1201		return 0;
1202
1203	inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1204	inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1205
1206	return 1;
1207}
1208
1209/**
1210 * IF Temp[0].x -> IF Temp[0].x
1211 * ...          -> ...
1212 * KILP         -> KIL -abs(Temp[0].x)
1213 * ...          -> ...
1214 * ENDIF        -> ENDIF
1215 *
1216 * === OR ===
1217 *
1218 * IF Temp[0].x -\
1219 * KILP         - > KIL -abs(Temp[0].x)
1220 * ENDIF        -/
1221 *
1222 * === OR ===
1223 *
1224 * IF Temp[0].x -> IF Temp[0].x
1225 * ...          -> ...
1226 * ELSE         -> ELSE
1227 * ...	        -> ...
1228 * KILP	        -> KIL -abs(Temp[0].x)
1229 * ...          -> ...
1230 * ENDIF        -> ENDIF
1231 *
1232 * === OR ===
1233 *
1234 * KILP         -> KIL -none.1111
1235 *
1236 * This needs to be done in its own pass, because it might modify the
1237 * instructions before and after KILP.
1238 */
1239void rc_transform_KILP(struct radeon_compiler * c, void *user)
1240{
1241	struct rc_instruction * inst;
1242	for (inst = c->Program.Instructions.Next;
1243			inst != &c->Program.Instructions; inst = inst->Next) {
1244		struct rc_instruction * if_inst;
1245		unsigned in_if = 0;
1246
1247		if (inst->U.I.Opcode != RC_OPCODE_KILP)
1248			continue;
1249
1250		for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1251						if_inst = if_inst->Prev) {
1252
1253			if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1254				in_if = 1;
1255				break;
1256			}
1257		}
1258
1259		inst->U.I.Opcode = RC_OPCODE_KIL;
1260
1261		if (!in_if) {
1262			inst->U.I.SrcReg[0] = negate(builtin_one);
1263		} else {
1264			/* This should work even if the KILP is inside the ELSE
1265			 * block, because -0.0 is considered negative. */
1266			inst->U.I.SrcReg[0] =
1267				negate(absolute(if_inst->U.I.SrcReg[0]));
1268
1269			if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1270				&& inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1271
1272				/* Optimize the special case:
1273				 * IF Temp[0].x
1274				 * KILP
1275				 * ENDIF
1276				 */
1277
1278				/* Remove IF */
1279				rc_remove_instruction(inst->Prev);
1280				/* Remove ENDIF */
1281				rc_remove_instruction(inst->Next);
1282			}
1283		}
1284	}
1285}
1286