1/*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 *      Vadim Girlin
25 */
26
27#include <cmath>
28
29#include "sb_shader.h"
30
31namespace r600_sb {
32
33value* get_select_value_for_em(shader& sh, value* em) {
34	if (!em->def)
35		return NULL;
36
37	node *predset = em->def;
38	if (!predset->is_pred_set())
39		return NULL;
40
41	alu_node *s = sh.clone(static_cast<alu_node*>(predset));
42	convert_predset_to_set(sh, s);
43
44	predset->insert_after(s);
45
46	value* &d0 = s->dst[0];
47	d0 = sh.create_temp_value();
48	d0->def = s;
49	return d0;
50}
51
52void convert_to_mov(alu_node &n, value *src, bool neg, bool abs) {
53	n.src.resize(1);
54	n.src[0] = src;
55	n.bc.src[0].abs = abs;
56	n.bc.src[0].neg = neg;
57	n.bc.set_op(ALU_OP1_MOV);
58}
59
60expr_handler::expr_handler(shader& sh) : sh(sh), vt(sh.vt) {}
61
62value * expr_handler::get_const(const literal &l) {
63	value *v = sh.get_const_value(l);
64	if (!v->gvn_source)
65		vt.add_value(v);
66	return v;
67}
68
69void expr_handler::assign_source(value *dst, value *src) {
70	dst->gvn_source = src->gvn_source;
71}
72
73bool expr_handler::equal(value *l, value *r) {
74
75	assert(l != r);
76
77	if (l->gvalue() == r->gvalue())
78		return true;
79
80	if (l->def && r->def)
81		return defs_equal(l, r);
82
83	if (l->is_rel() && r->is_rel())
84		return ivars_equal(l, r);
85
86	return false;
87}
88
89bool expr_handler::ivars_equal(value* l, value* r) {
90	if (l->rel->gvalue() == r->rel->gvalue()
91			&& l->select == r->select) {
92
93		vvec &lv = l->mdef.empty() ? l->muse : l->mdef;
94		vvec &rv = r->mdef.empty() ? r->muse : r->mdef;
95
96		// FIXME: replace this with more precise aliasing test
97		return lv == rv;
98	}
99	return false;
100}
101
102bool expr_handler::defs_equal(value* l, value* r) {
103
104	node *d1 = l->def;
105	node *d2 = r->def;
106
107	if (d1->type != d2->type || d1->subtype != d2->subtype)
108		return false;
109
110	if (d1->is_pred_set() || d2->is_pred_set())
111		return false;
112
113	if (d1->type == NT_OP) {
114		switch (d1->subtype) {
115		case NST_ALU_INST:
116			return ops_equal(
117					static_cast<alu_node*>(d1),
118					static_cast<alu_node*>(d2));
119//		case NST_FETCH_INST: return ops_equal(static_cast<fetch_node*>(d1),
120//			static_cast<fetch_node*>(d2);
121//		case NST_CF_INST: return ops_equal(static_cast<cf_node*>(d1),
122//			static_cast<cf_node*>(d2);
123		default:
124			break;
125		}
126	}
127	return false;
128}
129
130bool expr_handler::try_fold(value* v) {
131	assert(!v->gvn_source);
132
133	if (v->def)
134		try_fold(v->def);
135
136	if (v->gvn_source)
137		return true;
138
139	return false;
140}
141
142bool expr_handler::try_fold(node* n) {
143	return n->fold_dispatch(this);
144}
145
146bool expr_handler::fold(node& n) {
147	if (n.subtype == NST_PHI) {
148
149		value *s = n.src[0];
150
151		// FIXME disabling phi folding for registers for now, otherwise we lose
152		// control flow information in some cases
153		// (GCM fails on tests/shaders/glsl-fs-if-nested-loop.shader_test)
154		// probably control flow transformation is required to enable it
155		if (s->is_sgpr())
156			return false;
157
158		for(vvec::iterator I = n.src.begin() + 1, E = n.src.end(); I != E; ++I) {
159			value *v = *I;
160			if (!s->v_equal(v))
161				return false;
162		}
163
164		assign_source(n.dst[0], s);
165	} else {
166		assert(n.subtype == NST_PSI);
167		assert(n.src.size() >= 6);
168
169		value *s = n.src[2];
170		assert(s->gvn_source);
171
172		for(vvec::iterator I = n.src.begin() + 3, E = n.src.end(); I != E; I += 3) {
173			value *v = *(I+2);
174			if (!s->v_equal(v))
175				return false;
176		}
177		assign_source(n.dst[0], s);
178	}
179	return true;
180}
181
182bool expr_handler::fold(container_node& n) {
183	return false;
184}
185
186bool expr_handler::fold_setcc(alu_node &n) {
187
188	value* v0 = n.src[0]->gvalue();
189	value* v1 = n.src[1]->gvalue();
190
191	assert(v0 && v1 && n.dst[0]);
192
193	unsigned flags = n.bc.op_ptr->flags;
194	unsigned cc = flags & AF_CC_MASK;
195	unsigned cmp_type = flags & AF_CMP_TYPE_MASK;
196	unsigned dst_type = flags & AF_DST_TYPE_MASK;
197
198	bool cond_result;
199	bool have_result = false;
200
201	bool isc0 = v0->is_const();
202	bool isc1 = v1->is_const();
203
204	literal dv, cv0, cv1;
205
206	if (isc0) {
207		cv0 = v0->get_const_value();
208		apply_alu_src_mod(n.bc, 0, cv0);
209	}
210
211	if (isc1) {
212		cv1 = v1->get_const_value();
213		apply_alu_src_mod(n.bc, 1, cv1);
214	}
215
216	if (isc0 && isc1) {
217		cond_result = evaluate_condition(flags, cv0, cv1);
218		have_result = true;
219	} else if (isc1) {
220		if (cmp_type == AF_FLOAT_CMP) {
221			if (n.bc.src[0].abs && !n.bc.src[0].neg) {
222				if (cv1.f < 0.0f && (cc == AF_CC_GT || cc == AF_CC_NE)) {
223					cond_result = true;
224					have_result = true;
225				} else if (cv1.f <= 0.0f && cc == AF_CC_GE) {
226					cond_result = true;
227					have_result = true;
228				}
229			} else if (n.bc.src[0].abs && n.bc.src[0].neg) {
230				if (cv1.f > 0.0f && (cc == AF_CC_GE || cc == AF_CC_E)) {
231					cond_result = false;
232					have_result = true;
233				} else if (cv1.f >= 0.0f && cc == AF_CC_GT) {
234					cond_result = false;
235					have_result = true;
236				}
237			}
238		} else if (cmp_type == AF_UINT_CMP && cv1.u == 0 && cc == AF_CC_GE) {
239			cond_result = true;
240			have_result = true;
241		}
242	} else if (isc0) {
243		if (cmp_type == AF_FLOAT_CMP) {
244			if (n.bc.src[1].abs && !n.bc.src[1].neg) {
245				if (cv0.f <= 0.0f && cc == AF_CC_GT) {
246					cond_result = false;
247					have_result = true;
248				} else if (cv0.f < 0.0f && (cc == AF_CC_GE || cc == AF_CC_E)) {
249					cond_result = false;
250					have_result = true;
251				}
252			} else if (n.bc.src[1].abs && n.bc.src[1].neg) {
253				if (cv0.f >= 0.0f && cc == AF_CC_GE) {
254					cond_result = true;
255					have_result = true;
256				} else if (cv0.f > 0.0f && (cc == AF_CC_GT || cc == AF_CC_NE)) {
257					cond_result = true;
258					have_result = true;
259				}
260			}
261		} else if (cmp_type == AF_UINT_CMP && cv0.u == 0 && cc == AF_CC_GT) {
262			cond_result = false;
263			have_result = true;
264		}
265	} else if (v0 == v1) {
266		bc_alu_src &s0 = n.bc.src[0], &s1 = n.bc.src[1];
267		if (s0.abs == s1.abs && s0.neg == s1.neg && cmp_type != AF_FLOAT_CMP) {
268			// NOTE can't handle float comparisons here because of NaNs
269			cond_result = (cc == AF_CC_E || cc == AF_CC_GE);
270			have_result = true;
271		}
272	}
273
274	if (have_result) {
275		literal result;
276
277		if (cond_result)
278			result = dst_type != AF_FLOAT_DST ?
279					literal(0xFFFFFFFFu) : literal(1.0f);
280		else
281			result = literal(0);
282
283		convert_to_mov(n, sh.get_const_value(result));
284		return fold_alu_op1(n);
285	}
286
287	return false;
288}
289
290bool expr_handler::fold(alu_node& n) {
291
292	switch (n.bc.op_ptr->src_count) {
293	case 1: return fold_alu_op1(n);
294	case 2: return fold_alu_op2(n);
295	case 3: return fold_alu_op3(n);
296	default:
297		assert(0);
298	}
299	return false;
300}
301
302bool expr_handler::fold(fetch_node& n) {
303
304	unsigned chan = 0;
305	for (vvec::iterator I = n.dst.begin(), E = n.dst.end(); I != E; ++I) {
306		value* &v = *I;
307		if (v) {
308			if (n.bc.dst_sel[chan] == SEL_0)
309				assign_source(*I, get_const(0.0f));
310			else if (n.bc.dst_sel[chan] == SEL_1)
311				assign_source(*I, get_const(1.0f));
312		}
313		++chan;
314	}
315	return false;
316}
317
318bool expr_handler::fold(cf_node& n) {
319	return false;
320}
321
322void expr_handler::apply_alu_src_mod(const bc_alu &bc, unsigned src,
323                                     literal &v) {
324	const bc_alu_src &s = bc.src[src];
325
326	if (s.abs)
327		v = fabs(v.f);
328	if (s.neg)
329		v = -v.f;
330}
331
332void expr_handler::apply_alu_dst_mod(const bc_alu &bc, literal &v) {
333	float omod_coeff[] = {2.0f, 4.0, 0.5f};
334
335	if (bc.omod)
336		v = v.f * omod_coeff[bc.omod - 1];
337	if (bc.clamp)
338		v = float_clamp(v.f);
339}
340
341bool expr_handler::args_equal(const vvec &l, const vvec &r) {
342
343	assert(l.size() == r.size());
344
345	int s = l.size();
346
347	for (int k = 0; k < s; ++k) {
348		if (!l[k]->v_equal(r[k]))
349			return false;
350	}
351
352	return true;
353}
354
355bool expr_handler::ops_equal(const alu_node *l, const alu_node* r) {
356	const bc_alu &b0 = l->bc;
357	const bc_alu &b1 = r->bc;
358
359	if (b0.op != b1.op)
360		return false;
361
362	unsigned src_count = b0.op_ptr->src_count;
363
364	if (b0.index_mode != b1.index_mode)
365		return false;
366
367	if (b0.clamp != b1.clamp || b0.omod != b1.omod)
368			return false;
369
370	for (unsigned s = 0; s < src_count; ++s) {
371		const bc_alu_src &s0 = b0.src[s];
372		const bc_alu_src &s1 = b1.src[s];
373
374		if (s0.abs != s1.abs || s0.neg != s1.neg)
375			return false;
376	}
377	return args_equal(l->src, r->src);
378}
379
380bool expr_handler::fold_alu_op1(alu_node& n) {
381
382	assert(!n.src.empty());
383	if (n.src.empty())
384		return false;
385
386	value* v0 = n.src[0]->gvalue();
387
388	assert(v0 && n.dst[0]);
389
390	if (!v0->is_const()) {
391		// handle (MOV -(MOV -x)) => (MOV x)
392		if (n.bc.op == ALU_OP1_MOV && n.bc.src[0].neg && !n.bc.src[1].abs
393				&& v0->def && v0->def->is_alu_op(ALU_OP1_MOV)) {
394			alu_node *sd = static_cast<alu_node*>(v0->def);
395			if (!sd->bc.clamp && !sd->bc.omod && !sd->bc.src[0].abs &&
396					sd->bc.src[0].neg) {
397				n.src[0] = sd->src[0];
398				n.bc.src[0].neg = 0;
399				v0 = n.src[0]->gvalue();
400			}
401		}
402
403		if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT ||
404				n.bc.op == ALU_OP1_MOVA_GPR_INT)
405				&& n.bc.clamp == 0 && n.bc.omod == 0
406				&& n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 &&
407				n.src.size() == 1 /* RIM/SIM can be appended as additional values */) {
408			assign_source(n.dst[0], v0);
409			return true;
410		}
411		return false;
412	}
413
414	literal dv, cv = v0->get_const_value();
415	apply_alu_src_mod(n.bc, 0, cv);
416
417	switch (n.bc.op) {
418	case ALU_OP1_CEIL: dv = ceil(cv.f); break;
419	case ALU_OP1_COS: dv = cos(cv.f * 2.0f * M_PI); break;
420	case ALU_OP1_EXP_IEEE: dv = exp2(cv.f); break;
421	case ALU_OP1_FLOOR: dv = floor(cv.f); break;
422	case ALU_OP1_FLT_TO_INT: dv = (int)cv.f; break; // FIXME: round modes ????
423	case ALU_OP1_FLT_TO_INT_FLOOR: dv = (int32_t)floor(cv.f); break;
424	case ALU_OP1_FLT_TO_INT_RPI: dv = (int32_t)floor(cv.f + 0.5f); break;
425	case ALU_OP1_FLT_TO_INT_TRUNC: dv = (int32_t)trunc(cv.f); break;
426	case ALU_OP1_FLT_TO_UINT: dv = (uint32_t)cv.f; break;
427	case ALU_OP1_FRACT: dv = cv.f - floor(cv.f); break;
428	case ALU_OP1_INT_TO_FLT: dv = (float)cv.i; break;
429	case ALU_OP1_LOG_CLAMPED:
430	case ALU_OP1_LOG_IEEE:
431		if (cv.f != 0.0f)
432			dv = log2(cv.f);
433		else
434			// don't fold to NAN, let the GPU handle it for now
435			// (prevents degenerate LIT tests from failing)
436			return false;
437		break;
438	case ALU_OP1_MOV: dv = cv; break;
439	case ALU_OP1_MOVA_INT: dv = cv; break; // FIXME ???
440//	case ALU_OP1_MOVA_FLOOR: dv = (int32_t)floor(cv.f); break;
441//	case ALU_OP1_MOVA_GPR_INT:
442	case ALU_OP1_NOT_INT: dv = ~cv.i; break;
443	case ALU_OP1_PRED_SET_INV:
444		dv = cv.f == 0.0f ? 1.0f : (cv.f == 1.0f ? 0.0f : cv.f); break;
445	case ALU_OP1_PRED_SET_RESTORE: dv = cv; break;
446	case ALU_OP1_RECIPSQRT_CLAMPED:
447	case ALU_OP1_RECIPSQRT_FF:
448	case ALU_OP1_RECIPSQRT_IEEE: dv = 1.0f / sqrt(cv.f); break;
449	case ALU_OP1_RECIP_CLAMPED:
450	case ALU_OP1_RECIP_FF:
451	case ALU_OP1_RECIP_IEEE: dv = 1.0f / cv.f; break;
452//	case ALU_OP1_RECIP_INT:
453	case ALU_OP1_RECIP_UINT: dv.u = (1ull << 32) / cv.u; break;
454//	case ALU_OP1_RNDNE: dv = floor(cv.f + 0.5f); break;
455	case ALU_OP1_SIN: dv = sin(cv.f * 2.0f * M_PI); break;
456	case ALU_OP1_SQRT_IEEE: dv = sqrt(cv.f); break;
457	case ALU_OP1_TRUNC: dv = trunc(cv.f); break;
458
459	default:
460		return false;
461	}
462
463	apply_alu_dst_mod(n.bc, dv);
464	assign_source(n.dst[0], get_const(dv));
465	return true;
466}
467
// Tries to merge an ADD with the MUL / MUL_IEEE that defines one of its
// sources into a single MULADD / MULADD_IEEE:
//
//   (ADD (MUL a, b), c)  =>  (MULADD a, b, c)
//   (ADD c, (MUL a, b))  =>  (MULADD a, b, c)
//
// `n` is rewritten in place and then handed to fold_alu_op3() for further
// folding. The merge is only done when none of the involved sources uses
// abs, the multiply has neither omod nor clamp, the ADD has no omod, and
// the three resulting sources are not all kcache values.
//
// Returns true when `n` was rewritten, false otherwise.
//
// NOTE(review): when src[0] is defined by an ALU instruction that is not a
// multiply, the function returns false right away and never looks at
// src[1] - looks like a missed opportunity rather than a requirement;
// confirm before changing.
bool expr_handler::fold_mul_add(alu_node *n) {

	bool ieee;
	value* v0 = n->src[0]->gvalue();

	// instruction defining the first ADD operand, if it is an ALU instruction
	alu_node *d0 = (v0->def && v0->def->is_alu_inst()) ?
			static_cast<alu_node*>(v0->def) : NULL;

	if (d0) {
		if (d0->is_alu_op(ALU_OP2_MUL_IEEE))
			ieee = true;
		else if (d0->is_alu_op(ALU_OP2_MUL))
			ieee = false;
		else
			return false;

		if (!d0->bc.src[0].abs && !d0->bc.src[1].abs &&
				!n->bc.src[1].abs && !n->bc.src[0].abs && !d0->bc.omod &&
				!d0->bc.clamp && !n->bc.omod &&
				(!d0->src[0]->is_kcache() || !d0->src[1]->is_kcache() ||
						!n->src[1]->is_kcache())) {

			// negation of the whole product, moved onto one factor below
			bool mul_neg = n->bc.src[0].neg;

			n->src.resize(3);
			n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD);
			// the old second operand becomes the addend before src[1] is
			// overwritten with the second factor
			n->src[2] = n->src[1];
			n->bc.src[2] = n->bc.src[1];
			n->src[0] = d0->src[0];
			n->bc.src[0] = d0->bc.src[0];
			n->src[1] = d0->src[1];
			n->bc.src[1] = d0->bc.src[1];

			n->bc.src[0].neg ^= mul_neg;

			fold_alu_op3(*n);
			return true;
		}
	}

	value* v1 = n->src[1]->gvalue();

	// same again for the second ADD operand
	alu_node *d1 = (v1->def && v1->def->is_alu_inst()) ?
			static_cast<alu_node*>(v1->def) : NULL;

	if (d1) {
		if (d1->is_alu_op(ALU_OP2_MUL_IEEE))
			ieee = true;
		else if (d1->is_alu_op(ALU_OP2_MUL))
			ieee = false;
		else
			return false;

		if (!d1->bc.src[1].abs && !d1->bc.src[0].abs &&
				!n->bc.src[0].abs && !n->bc.src[1].abs && !d1->bc.omod &&
				!d1->bc.clamp && !n->bc.omod &&
				(!d1->src[0]->is_kcache() || !d1->src[1]->is_kcache() ||
						!n->src[0]->is_kcache())) {

			// negation of the whole product, moved onto one factor below
			bool mul_neg = n->bc.src[1].neg;

			n->src.resize(3);
			n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD);
			// the old first operand becomes the addend before src[0] is
			// overwritten with the first factor
			n->src[2] = n->src[0];
			n->bc.src[2] = n->bc.src[0];
			n->src[1] = d1->src[1];
			n->bc.src[1] = d1->bc.src[1];
			n->src[0] = d1->src[0];
			n->bc.src[0] = d1->bc.src[0];

			n->bc.src[1].neg ^= mul_neg;

			fold_alu_op3(*n);
			return true;
		}
	}

	return false;
}
547
548bool expr_handler::eval_const_op(unsigned op, literal &r,
549                                 literal cv0, literal cv1) {
550
551	switch (op) {
552	case ALU_OP2_ADD: r = cv0.f + cv1.f; break;
553	case ALU_OP2_ADDC_UINT:
554		r = (uint32_t)(((uint64_t)cv0.u + cv1.u)>>32); break;
555	case ALU_OP2_ADD_INT: r = cv0.i + cv1.i; break;
556	case ALU_OP2_AND_INT: r = cv0.i & cv1.i; break;
557	case ALU_OP2_ASHR_INT: r = cv0.i >> (cv1.i & 0x1F); break;
558	case ALU_OP2_BFM_INT:
559		r = (((1 << (cv0.i & 0x1F)) - 1) << (cv1.i & 0x1F)); break;
560	case ALU_OP2_LSHL_INT: r = cv0.i << cv1.i; break;
561	case ALU_OP2_LSHR_INT: r = cv0.u >> cv1.u; break;
562	case ALU_OP2_MAX:
563	case ALU_OP2_MAX_DX10: r = cv0.f > cv1.f ? cv0.f : cv1.f; break;
564	case ALU_OP2_MAX_INT: r = cv0.i > cv1.i ? cv0.i : cv1.i; break;
565	case ALU_OP2_MAX_UINT: r = cv0.u > cv1.u ? cv0.u : cv1.u; break;
566	case ALU_OP2_MIN:
567	case ALU_OP2_MIN_DX10: r = cv0.f < cv1.f ? cv0.f : cv1.f; break;
568	case ALU_OP2_MIN_INT: r = cv0.i < cv1.i ? cv0.i : cv1.i; break;
569	case ALU_OP2_MIN_UINT: r = cv0.u < cv1.u ? cv0.u : cv1.u; break;
570	case ALU_OP2_MUL:
571	case ALU_OP2_MUL_IEEE: r = cv0.f * cv1.f; break;
572	case ALU_OP2_MULHI_INT:
573		r = (int32_t)(((int64_t)cv0.u * cv1.u)>>32); break;
574	case ALU_OP2_MULHI_UINT:
575		r = (uint32_t)(((uint64_t)cv0.u * cv1.u)>>32); break;
576	case ALU_OP2_MULLO_INT:
577		r = (int32_t)(((int64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break;
578	case ALU_OP2_MULLO_UINT:
579		r = (uint32_t)(((uint64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break;
580	case ALU_OP2_OR_INT: r = cv0.i | cv1.i; break;
581	case ALU_OP2_SUB_INT: r = cv0.i - cv1.i; break;
582	case ALU_OP2_XOR_INT: r = cv0.i ^ cv1.i; break;
583
584	default:
585		return false;
586	}
587
588	return true;
589}
590
591// fold the chain of associative ops, e.g. (ADD 2, (ADD x, 3)) => (ADD x, 5)
592bool expr_handler::fold_assoc(alu_node *n) {
593
594	alu_node *a = n;
595	literal cr;
596
597	int last_arg = -3;
598
599	unsigned op = n->bc.op;
600	bool allow_neg = false, cur_neg = false;
601	bool distribute_neg = false;
602
603	switch(op) {
604	case ALU_OP2_ADD:
605		distribute_neg = true;
606		allow_neg = true;
607		break;
608	case ALU_OP2_MUL:
609	case ALU_OP2_MUL_IEEE:
610		allow_neg = true;
611		break;
612	case ALU_OP3_MULADD:
613		allow_neg = true;
614		op = ALU_OP2_MUL;
615		break;
616	case ALU_OP3_MULADD_IEEE:
617		allow_neg = true;
618		op = ALU_OP2_MUL_IEEE;
619		break;
620	default:
621		if (n->bc.op_ptr->src_count != 2)
622			return false;
623	}
624
625	// check if we can evaluate the op
626	if (!eval_const_op(op, cr, literal(0), literal(0)))
627		return false;
628
629	while (true) {
630
631		value *v0 = a->src[0]->gvalue();
632		value *v1 = a->src[1]->gvalue();
633
634		last_arg = -2;
635
636		if (v1->is_const()) {
637			literal arg = v1->get_const_value();
638			apply_alu_src_mod(a->bc, 1, arg);
639			if (cur_neg && distribute_neg)
640				arg.f = -arg.f;
641
642			if (a == n)
643				cr = arg;
644			else
645				eval_const_op(op, cr, cr, arg);
646
647			if (v0->def) {
648				alu_node *d0 = static_cast<alu_node*>(v0->def);
649				if ((d0->is_alu_op(op) ||
650						(op == ALU_OP2_MUL_IEEE &&
651								d0->is_alu_op(ALU_OP2_MUL))) &&
652						!d0->bc.omod && !d0->bc.clamp &&
653						!a->bc.src[0].abs &&
654						(!a->bc.src[0].neg || allow_neg)) {
655					cur_neg ^= a->bc.src[0].neg;
656					a = d0;
657					continue;
658				}
659			}
660			last_arg = 0;
661
662		}
663
664		if (v0->is_const()) {
665			literal arg = v0->get_const_value();
666			apply_alu_src_mod(a->bc, 0, arg);
667			if (cur_neg && distribute_neg)
668				arg.f = -arg.f;
669
670			if (last_arg == 0) {
671				eval_const_op(op, cr, cr, arg);
672				last_arg = -1;
673				break;
674			}
675
676			if (a == n)
677				cr = arg;
678			else
679				eval_const_op(op, cr, cr, arg);
680
681			if (v1->def) {
682				alu_node *d1 = static_cast<alu_node*>(v1->def);
683				if ((d1->is_alu_op(op) ||
684						(op == ALU_OP2_MUL_IEEE &&
685								d1->is_alu_op(ALU_OP2_MUL))) &&
686						!d1->bc.omod && !d1->bc.clamp &&
687						!a->bc.src[1].abs &&
688						(!a->bc.src[1].neg || allow_neg)) {
689					cur_neg ^= a->bc.src[1].neg;
690					a = d1;
691					continue;
692				}
693			}
694
695			last_arg = 1;
696		}
697
698		break;
699	};
700
701	if (last_arg == -1) {
702		// result is const
703		apply_alu_dst_mod(n->bc, cr);
704
705		if (n->bc.op == op) {
706			convert_to_mov(*n, sh.get_const_value(cr));
707			fold_alu_op1(*n);
708			return true;
709		} else { // MULADD => ADD
710			n->src[0] = n->src[2];
711			n->bc.src[0] = n->bc.src[2];
712			n->src[1] = sh.get_const_value(cr);
713			memset(&n->bc.src[1], 0, sizeof(bc_alu_src));
714
715			n->src.resize(2);
716			n->bc.set_op(ALU_OP2_ADD);
717		}
718	} else if (last_arg >= 0) {
719		n->src[0] = a->src[last_arg];
720		n->bc.src[0] = a->bc.src[last_arg];
721		n->bc.src[0].neg ^= cur_neg;
722		n->src[1] = sh.get_const_value(cr);
723		memset(&n->bc.src[1], 0, sizeof(bc_alu_src));
724	}
725
726	return false;
727}
728
// Folds a two-source ALU instruction.
//
// In order:
//  1. comparison instructions (AF_SET) are handed to fold_setcc();
//  2. unless safe_math is set, chains of associative operations are folded
//     by fold_assoc() (which may rewrite `n` even when it returns false);
//  3. operations on two equal operands are simplified
//     (MIN/MAX x, x => MOV x; ADD x, x => MUL x, 2; ADD x, -x => MOV 0);
//  4. an ADD fed by a MUL is merged into MULADD by fold_mul_add();
//  5. with two constant operands the result is computed by eval_const_op();
//     with one constant operand equal to 0 or 1.0f the instruction is
//     reduced to a MOV of the other operand or of the constant 0.
//
// Returns false when nothing could be folded; instructions converted to a
// MOV return whatever fold_alu_op1() reports for them.
bool expr_handler::fold_alu_op2(alu_node& n) {

	if (n.src.size() < 2)
		return false;

	unsigned flags = n.bc.op_ptr->flags;

	if (flags & AF_SET) {
		return fold_setcc(n);
	}

	if (!sh.safe_math && (flags & AF_M_ASSOC)) {
		if (fold_assoc(&n))
			return true;
	}

	// read the operands only now, fold_assoc may have replaced them
	value* v0 = n.src[0]->gvalue();
	value* v1 = n.src[1]->gvalue();

	assert(v0 && v1);

	// handle some operations with equal args, e.g. x + x => x * 2
	if (v0 == v1) {
		// same value with identical modifiers on both sides
		if (n.bc.src[0].neg == n.bc.src[1].neg &&
				n.bc.src[0].abs == n.bc.src[1].abs) {
			switch (n.bc.op) {
			case ALU_OP2_MIN: // (MIN x, x) => (MOV x)
			case ALU_OP2_MAX:
				convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs);
				return fold_alu_op1(n);
			case ALU_OP2_ADD:  // (ADD x, x) => (MUL x, 2)
				if (!sh.safe_math) {
					n.src[1] = sh.get_const_value(2.0f);
					memset(&n.bc.src[1], 0, sizeof(bc_alu_src));
					n.bc.set_op(ALU_OP2_MUL);
					// fold the new MUL from scratch
					return fold_alu_op2(n);
				}
				break;
			}
		}
		// same value, opposite sign
		if (n.bc.src[0].neg != n.bc.src[1].neg &&
				n.bc.src[0].abs == n.bc.src[1].abs) {
			switch (n.bc.op) {
			case ALU_OP2_ADD:  // (ADD x, -x) => (MOV 0)
				if (!sh.safe_math) {
					convert_to_mov(n, sh.get_const_value(literal(0)));
					return fold_alu_op1(n);
				}
				break;
			}
		}
	}

	if (n.bc.op == ALU_OP2_ADD) {
		if (fold_mul_add(&n))
			return true;
	}

	bool isc0 = v0->is_const();
	bool isc1 = v1->is_const();

	if (!isc0 && !isc1)
		return false;

	literal dv, cv0, cv1;

	// constant operands with their abs/neg modifiers applied
	if (isc0) {
		cv0 = v0->get_const_value();
		apply_alu_src_mod(n.bc, 0, cv0);
	}

	if (isc1) {
		cv1 = v1->get_const_value();
		apply_alu_src_mod(n.bc, 1, cv1);
	}

	if (isc0 && isc1) {

		if (!eval_const_op(n.bc.op, dv, cv0, cv1))
			return false;

	} else { // one source is const

		if (isc0 && cv0 == literal(0)) {
			// (op 0, x)
			switch (n.bc.op) {
			// 0 is neutral: result is x
			case ALU_OP2_ADD:
			case ALU_OP2_ADD_INT:
			case ALU_OP2_MAX_UINT:
			case ALU_OP2_OR_INT:
			case ALU_OP2_XOR_INT:
				convert_to_mov(n, n.src[1], n.bc.src[1].neg,  n.bc.src[1].abs);
				return fold_alu_op1(n);
			// result is 0 whatever x is
			case ALU_OP2_AND_INT:
			case ALU_OP2_ASHR_INT:
			case ALU_OP2_LSHL_INT:
			case ALU_OP2_LSHR_INT:
			case ALU_OP2_MIN_UINT:
			case ALU_OP2_MUL:
			case ALU_OP2_MULHI_UINT:
			case ALU_OP2_MULLO_UINT:
				convert_to_mov(n, sh.get_const_value(literal(0)));
				return fold_alu_op1(n);
			}
		} else if (isc1 && cv1 == literal(0)) {
			// (op x, 0)
			switch (n.bc.op) {
			// 0 is neutral: result is x
			case ALU_OP2_ADD:
			case ALU_OP2_ADD_INT:
			case ALU_OP2_ASHR_INT:
			case ALU_OP2_LSHL_INT:
			case ALU_OP2_LSHR_INT:
			case ALU_OP2_MAX_UINT:
			case ALU_OP2_OR_INT:
			case ALU_OP2_SUB_INT:
			case ALU_OP2_XOR_INT:
				convert_to_mov(n, n.src[0], n.bc.src[0].neg,  n.bc.src[0].abs);
				return fold_alu_op1(n);
			// result is 0 whatever x is
			case ALU_OP2_AND_INT:
			case ALU_OP2_MIN_UINT:
			case ALU_OP2_MUL:
			case ALU_OP2_MULHI_UINT:
			case ALU_OP2_MULLO_UINT:
				convert_to_mov(n, sh.get_const_value(literal(0)));
				return fold_alu_op1(n);
			}
		} else if (isc0 && cv0 == literal(1.0f)) {
			// (MUL 1.0, x) => (MOV x)
			switch (n.bc.op) {
			case ALU_OP2_MUL:
			case ALU_OP2_MUL_IEEE:
				convert_to_mov(n, n.src[1], n.bc.src[1].neg,  n.bc.src[1].abs);
				return fold_alu_op1(n);
			}
		} else if (isc1 && cv1 == literal(1.0f)) {
			// (MUL x, 1.0) => (MOV x)
			switch (n.bc.op) {
			case ALU_OP2_MUL:
			case ALU_OP2_MUL_IEEE:
				convert_to_mov(n, n.src[0], n.bc.src[0].neg,  n.bc.src[0].abs);
				return fold_alu_op1(n);
			}
		}

		return false;
	}

	// both operands were constants: dv holds the raw result
	apply_alu_dst_mod(n.bc, dv);
	assign_source(n.dst[0], get_const(dv));
	return true;
}
876
877bool expr_handler::evaluate_condition(unsigned alu_cnd_flags,
878                                      literal s1, literal s2) {
879
880	unsigned cmp_type = alu_cnd_flags & AF_CMP_TYPE_MASK;
881	unsigned cc = alu_cnd_flags & AF_CC_MASK;
882
883	switch (cmp_type) {
884	case AF_FLOAT_CMP: {
885		switch (cc) {
886		case AF_CC_E : return s1.f == s2.f;
887		case AF_CC_GT: return s1.f >  s2.f;
888		case AF_CC_GE: return s1.f >= s2.f;
889		case AF_CC_NE: return s1.f != s2.f;
890		case AF_CC_LT: return s1.f <  s2.f;
891		case AF_CC_LE: return s1.f <= s2.f;
892		default:
893			assert(!"invalid condition code");
894			return false;
895		}
896	}
897	case AF_INT_CMP: {
898		switch (cc) {
899		case AF_CC_E : return s1.i == s2.i;
900		case AF_CC_GT: return s1.i >  s2.i;
901		case AF_CC_GE: return s1.i >= s2.i;
902		case AF_CC_NE: return s1.i != s2.i;
903		case AF_CC_LT: return s1.i <  s2.i;
904		case AF_CC_LE: return s1.i <= s2.i;
905		default:
906			assert(!"invalid condition code");
907			return false;
908		}
909	}
910	case AF_UINT_CMP: {
911		switch (cc) {
912		case AF_CC_E : return s1.u == s2.u;
913		case AF_CC_GT: return s1.u >  s2.u;
914		case AF_CC_GE: return s1.u >= s2.u;
915		case AF_CC_NE: return s1.u != s2.u;
916		case AF_CC_LT: return s1.u <  s2.u;
917		case AF_CC_LE: return s1.u <= s2.u;
918		default:
919			assert(!"invalid condition code");
920			return false;
921		}
922	}
923	default:
924		assert(!"invalid cmp_type");
925		return false;
926	}
927}
928
929bool expr_handler::fold_alu_op3(alu_node& n) {
930
931	if (n.src.size() < 3)
932		return false;
933
934	if (!sh.safe_math && (n.bc.op_ptr->flags & AF_M_ASSOC)) {
935		if (fold_assoc(&n))
936			return true;
937	}
938
939	value* v0 = n.src[0]->gvalue();
940	value* v1 = n.src[1]->gvalue();
941	value* v2 = n.src[2]->gvalue();
942
943	assert(v0 && v1 && v2 && n.dst[0]);
944
945	bool isc0 = v0->is_const();
946	bool isc1 = v1->is_const();
947	bool isc2 = v2->is_const();
948
949	literal dv, cv0, cv1, cv2;
950
951	if (isc0) {
952		cv0 = v0->get_const_value();
953		apply_alu_src_mod(n.bc, 0, cv0);
954	}
955
956	if (isc1) {
957		cv1 = v1->get_const_value();
958		apply_alu_src_mod(n.bc, 1, cv1);
959	}
960
961	if (isc2) {
962		cv2 = v2->get_const_value();
963		apply_alu_src_mod(n.bc, 2, cv2);
964	}
965
966	unsigned flags = n.bc.op_ptr->flags;
967
968	if (flags & AF_CMOV) {
969		int src = 0;
970
971		if (v1 == v2 && n.bc.src[1].neg == n.bc.src[2].neg) {
972			// result doesn't depend on condition, convert to MOV
973			src = 1;
974		} else if (isc0) {
975			// src0 is const, condition can be evaluated, convert to MOV
976			bool cond = evaluate_condition(n.bc.op_ptr->flags & (AF_CC_MASK |
977					AF_CMP_TYPE_MASK), cv0, literal(0));
978			src = cond ? 1 : 2;
979		}
980
981		if (src) {
982			// if src is selected, convert to MOV
983			convert_to_mov(n, n.src[src], n.bc.src[src].neg);
984			return fold_alu_op1(n);
985		}
986	}
987
988	// handle (MULADD a, x, MUL (x, b)) => (MUL x, ADD (a, b))
989	if (!sh.safe_math && (n.bc.op == ALU_OP3_MULADD ||
990			n.bc.op == ALU_OP3_MULADD_IEEE)) {
991
992		unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ?
993				ALU_OP2_MUL_IEEE : ALU_OP2_MUL;
994
995		if (!isc2 && v2->def && v2->def->is_alu_op(op)) {
996
997			alu_node *md = static_cast<alu_node*>(v2->def);
998			value *mv0 = md->src[0]->gvalue();
999			value *mv1 = md->src[1]->gvalue();
1000
1001			int es0 = -1, es1;
1002
1003			if (v0 == mv0) {
1004				es0 = 0;
1005				es1 = 0;
1006			} else if (v0 == mv1) {
1007				es0 = 0;
1008				es1 = 1;
1009			} else if (v1 == mv0) {
1010				es0 = 1;
1011				es1 = 0;
1012			} else if (v1 == mv1) {
1013				es0 = 1;
1014				es1 = 1;
1015			}
1016
1017			if (es0 != -1) {
1018				value *va0 = es0 == 0 ? v1 : v0;
1019				value *va1 = es1 == 0 ? mv1 : mv0;
1020
1021				alu_node *add = sh.create_alu();
1022				add->bc.set_op(ALU_OP2_ADD);
1023
1024				add->dst.resize(1);
1025				add->src.resize(2);
1026
1027				value *t = sh.create_temp_value();
1028				t->def = add;
1029				add->dst[0] = t;
1030				add->src[0] = va0;
1031				add->src[1] = va1;
1032				add->bc.src[0] = n.bc.src[!es0];
1033				add->bc.src[1] = md->bc.src[!es1];
1034
1035				add->bc.src[1].neg ^= n.bc.src[2].neg ^
1036						(n.bc.src[es0].neg != md->bc.src[es1].neg);
1037
1038				n.insert_before(add);
1039				vt.add_value(t);
1040
1041				t = t->gvalue();
1042
1043				if (es0 == 1) {
1044					n.src[0] = n.src[1];
1045					n.bc.src[0] = n.bc.src[1];
1046				}
1047
1048				n.src[1] = t;
1049				memset(&n.bc.src[1], 0, sizeof(bc_alu_src));
1050
1051				n.src.resize(2);
1052
1053				n.bc.set_op(op);
1054				return fold_alu_op2(n);
1055			}
1056		}
1057	}
1058
1059	if (!isc0 && !isc1 && !isc2)
1060		return false;
1061
1062	if (isc0 && isc1 && isc2) {
1063		switch (n.bc.op) {
1064		case ALU_OP3_MULADD_IEEE:
1065		case ALU_OP3_MULADD: dv = cv0.f * cv1.f + cv2.f; break;
1066
1067		// TODO
1068
1069		default:
1070			return false;
1071		}
1072	} else {
1073		if (isc0 && isc1) {
1074			switch (n.bc.op) {
1075			case ALU_OP3_MULADD:
1076			case ALU_OP3_MULADD_IEEE:
1077				dv = cv0.f * cv1.f;
1078				n.bc.set_op(ALU_OP2_ADD);
1079				n.src[0] = sh.get_const_value(dv);
1080				memset(&n.bc.src[0], 0, sizeof(bc_alu_src));
1081				n.src[1] = n.src[2];
1082				n.bc.src[1] = n.bc.src[2];
1083				n.src.resize(2);
1084				return fold_alu_op2(n);
1085			}
1086		}
1087
1088		if (n.bc.op == ALU_OP3_MULADD) {
1089			if ((isc0 && cv0 == literal(0)) || (isc1 && cv1 == literal(0))) {
1090				convert_to_mov(n, n.src[2], n.bc.src[2].neg,  n.bc.src[2].abs);
1091				return fold_alu_op1(n);
1092			}
1093		}
1094
1095		if (n.bc.op == ALU_OP3_MULADD || n.bc.op == ALU_OP3_MULADD_IEEE) {
1096			unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ?
1097					ALU_OP2_MUL_IEEE : ALU_OP2_MUL;
1098
1099			if (isc1 && v0 == v2) {
1100				cv1.f += (n.bc.src[2].neg != n.bc.src[0].neg ? -1.0f : 1.0f);
1101				n.src[1] = sh.get_const_value(cv1);
1102				n.bc.src[1].neg = 0;
1103				n.bc.src[1].abs = 0;
1104				n.bc.set_op(op);
1105				n.src.resize(2);
1106				return fold_alu_op2(n);
1107			} else if (isc0 && v1 == v2) {
1108				cv0.f += (n.bc.src[2].neg != n.bc.src[1].neg ? -1.0f : 1.0f);
1109				n.src[0] = sh.get_const_value(cv0);
1110				n.bc.src[0].neg = 0;
1111				n.bc.src[0].abs = 0;
1112				n.bc.set_op(op);
1113				n.src.resize(2);
1114				return fold_alu_op2(n);
1115			}
1116		}
1117
1118		return false;
1119	}
1120
1121	apply_alu_dst_mod(n.bc, dv);
1122	assign_source(n.dst[0], get_const(dv));
1123	return true;
1124}
1125
1126unsigned invert_setcc_condition(unsigned cc, bool &swap_args) {
1127	unsigned ncc = 0;
1128
1129	switch (cc) {
1130	case AF_CC_E: ncc = AF_CC_NE; break;
1131	case AF_CC_NE: ncc = AF_CC_E; break;
1132	case AF_CC_GE: ncc = AF_CC_GT; swap_args = true; break;
1133	case AF_CC_GT: ncc = AF_CC_GE; swap_args = true; break;
1134	default:
1135		assert(!"unexpected condition code");
1136		break;
1137	}
1138	return ncc;
1139}
1140
1141unsigned get_setcc_op(unsigned cc, unsigned cmp_type, bool int_dst) {
1142
1143	if (int_dst && cmp_type == AF_FLOAT_CMP) {
1144		switch (cc) {
1145		case AF_CC_E: return ALU_OP2_SETE_DX10;
1146		case AF_CC_NE: return ALU_OP2_SETNE_DX10;
1147		case AF_CC_GT: return ALU_OP2_SETGT_DX10;
1148		case AF_CC_GE: return ALU_OP2_SETGE_DX10;
1149		}
1150	} else {
1151
1152		switch(cmp_type) {
1153		case AF_FLOAT_CMP: {
1154			switch (cc) {
1155			case AF_CC_E: return ALU_OP2_SETE;
1156			case AF_CC_NE: return ALU_OP2_SETNE;
1157			case AF_CC_GT: return ALU_OP2_SETGT;
1158			case AF_CC_GE: return ALU_OP2_SETGE;
1159			}
1160			break;
1161		}
1162		case AF_INT_CMP: {
1163			switch (cc) {
1164			case AF_CC_E: return ALU_OP2_SETE_INT;
1165			case AF_CC_NE: return ALU_OP2_SETNE_INT;
1166			case AF_CC_GT: return ALU_OP2_SETGT_INT;
1167			case AF_CC_GE: return ALU_OP2_SETGE_INT;
1168			}
1169			break;
1170		}
1171		case AF_UINT_CMP: {
1172			switch (cc) {
1173			case AF_CC_E: return ALU_OP2_SETE_INT;
1174			case AF_CC_NE: return ALU_OP2_SETNE_INT;
1175			case AF_CC_GT: return ALU_OP2_SETGT_UINT;
1176			case AF_CC_GE: return ALU_OP2_SETGE_UINT;
1177			}
1178			break;
1179		}
1180		}
1181	}
1182
1183	assert(!"unexpected cc&cmp_type combination");
1184	return ~0u;
1185}
1186
1187unsigned get_predsetcc_op(unsigned cc, unsigned cmp_type) {
1188
1189	switch(cmp_type) {
1190	case AF_FLOAT_CMP: {
1191		switch (cc) {
1192		case AF_CC_E: return ALU_OP2_PRED_SETE;
1193		case AF_CC_NE: return ALU_OP2_PRED_SETNE;
1194		case AF_CC_GT: return ALU_OP2_PRED_SETGT;
1195		case AF_CC_GE: return ALU_OP2_PRED_SETGE;
1196		}
1197		break;
1198	}
1199	case AF_INT_CMP: {
1200		switch (cc) {
1201		case AF_CC_E: return ALU_OP2_PRED_SETE_INT;
1202		case AF_CC_NE: return ALU_OP2_PRED_SETNE_INT;
1203		case AF_CC_GT: return ALU_OP2_PRED_SETGT_INT;
1204		case AF_CC_GE: return ALU_OP2_PRED_SETGE_INT;
1205		}
1206		break;
1207	}
1208	case AF_UINT_CMP: {
1209		switch (cc) {
1210		case AF_CC_E: return ALU_OP2_PRED_SETE_INT;
1211		case AF_CC_NE: return ALU_OP2_PRED_SETNE_INT;
1212		case AF_CC_GT: return ALU_OP2_PRED_SETGT_UINT;
1213		case AF_CC_GE: return ALU_OP2_PRED_SETGE_UINT;
1214		}
1215		break;
1216	}
1217	}
1218
1219	assert(!"unexpected cc&cmp_type combination");
1220	return ~0u;
1221}
1222
1223unsigned get_killcc_op(unsigned cc, unsigned cmp_type) {
1224
1225	switch(cmp_type) {
1226	case AF_FLOAT_CMP: {
1227		switch (cc) {
1228		case AF_CC_E: return ALU_OP2_KILLE;
1229		case AF_CC_NE: return ALU_OP2_KILLNE;
1230		case AF_CC_GT: return ALU_OP2_KILLGT;
1231		case AF_CC_GE: return ALU_OP2_KILLGE;
1232		}
1233		break;
1234	}
1235	case AF_INT_CMP: {
1236		switch (cc) {
1237		case AF_CC_E: return ALU_OP2_KILLE_INT;
1238		case AF_CC_NE: return ALU_OP2_KILLNE_INT;
1239		case AF_CC_GT: return ALU_OP2_KILLGT_INT;
1240		case AF_CC_GE: return ALU_OP2_KILLGE_INT;
1241		}
1242		break;
1243	}
1244	case AF_UINT_CMP: {
1245		switch (cc) {
1246		case AF_CC_E: return ALU_OP2_KILLE_INT;
1247		case AF_CC_NE: return ALU_OP2_KILLNE_INT;
1248		case AF_CC_GT: return ALU_OP2_KILLGT_UINT;
1249		case AF_CC_GE: return ALU_OP2_KILLGE_UINT;
1250		}
1251		break;
1252	}
1253	}
1254
1255	assert(!"unexpected cc&cmp_type combination");
1256	return ~0u;
1257}
1258
1259unsigned get_cndcc_op(unsigned cc, unsigned cmp_type) {
1260
1261	switch(cmp_type) {
1262	case AF_FLOAT_CMP: {
1263		switch (cc) {
1264		case AF_CC_E: return ALU_OP3_CNDE;
1265		case AF_CC_GT: return ALU_OP3_CNDGT;
1266		case AF_CC_GE: return ALU_OP3_CNDGE;
1267		}
1268		break;
1269	}
1270	case AF_INT_CMP: {
1271		switch (cc) {
1272		case AF_CC_E: return ALU_OP3_CNDE_INT;
1273		case AF_CC_GT: return ALU_OP3_CNDGT_INT;
1274		case AF_CC_GE: return ALU_OP3_CNDGE_INT;
1275		}
1276		break;
1277	}
1278	}
1279
1280	assert(!"unexpected cc&cmp_type combination");
1281	return ~0u;
1282}
1283
1284
1285void convert_predset_to_set(shader& sh, alu_node* a) {
1286
1287	unsigned flags = a->bc.op_ptr->flags;
1288	unsigned cc = flags & AF_CC_MASK;
1289	unsigned cmp_type = flags & AF_CMP_TYPE_MASK;
1290
1291	bool swap_args = false;
1292
1293	cc = invert_setcc_condition(cc, swap_args);
1294
1295	unsigned newop = get_setcc_op(cc, cmp_type, true);
1296
1297	a->dst.resize(1);
1298	a->bc.set_op(newop);
1299
1300	if (swap_args) {
1301		std::swap(a->src[0], a->src[1]);
1302		std::swap(a->bc.src[0], a->bc.src[1]);
1303	}
1304
1305	a->bc.update_exec_mask = 0;
1306	a->bc.update_pred = 0;
1307}
1308
1309} // namespace r600_sb
1310