1/*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "nv50_ir.h"
24#include "nv50_ir_target.h"
25#include "nv50_ir_build_util.h"
26
27extern "C" {
28#include "util/u_math.h"
29}
30
31namespace nv50_ir {
32
bool
Instruction::isNop() const
{
   // Pseudo instructions that never emit any code.
   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
      return true;
   if (terminator || join) // XXX: should terminator imply flow ?
      return false;
   if (!fixed && op == OP_NOP)
      return true;

   // If the representative of def(0) was never assigned a register id,
   // the result is unused and the whole instruction is a nop.
   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
      for (int d = 1; defExists(d); ++d)
         if (def(d).rep()->reg.data.id >= 0)
            WARN("part of vector result is unused !\n");
      return true;
   }

   // A MOV/UNION whose source value(s) already equal the destination
   // performs no work.
   if (op == OP_MOV || op == OP_UNION) {
      if (!getDef(0)->equals(getSrc(0)))
         return false;
      if (op == OP_UNION)
         if (!def(0).rep()->equals(getSrc(1)))
            return false;
      return true;
   }

   return false;
}
61
62bool Instruction::isDead() const
63{
64   if (op == OP_STORE ||
65       op == OP_EXPORT ||
66       op == OP_WRSV)
67      return false;
68
69   for (int d = 0; defExists(d); ++d)
70      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
71         return false;
72
73   if (terminator || asFlow())
74      return false;
75   if (fixed)
76      return false;
77
78   return true;
79};
80
81// =============================================================================
82
// Forward-substitutes plain register-to-register MOVs into their uses
// (see the detailed note above CopyPropagation::visit below).
class CopyPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
88
89// Propagate all MOVs forward to make subsequent optimization easier, except if
90// the sources stem from a phi, in which case we don't want to mess up potential
91// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
92bool
93CopyPropagation::visit(BasicBlock *bb)
94{
95   Instruction *mov, *si, *next;
96
97   for (mov = bb->getEntry(); mov; mov = next) {
98      next = mov->next;
99      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
100         continue;
101      if (mov->getPredicate())
102         continue;
103      if (mov->def(0).getFile() != mov->src(0).getFile())
104         continue;
105      si = mov->getSrc(0)->getInsn();
106      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
107         // propagate
108         mov->def(0).replace(mov->getSrc(0), false);
109         delete_Instruction(prog, mov);
110      }
111   }
112   return true;
113}
114
115// =============================================================================
116
// Folds loads / immediate MOVs directly into the instructions that use
// their results, where the target can encode such an operand.
class LoadPropagation : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // Swap src 0/1 so the operand that can be folded ends up in slot 1.
   void checkSwapSrc01(Instruction *);

   bool isCSpaceLoad(Instruction *);         // LOAD from constant memory ?
   bool isImmd32Load(Instruction *);         // 32-bit immediate MOV ?
   bool isAttribOrSharedLoad(Instruction *); // vfetch / input / shared load ?
};
128
129bool
130LoadPropagation::isCSpaceLoad(Instruction *ld)
131{
132   return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST;
133}
134
135bool
136LoadPropagation::isImmd32Load(Instruction *ld)
137{
138   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
139      return false;
140   return ld->src(0).getFile() == FILE_IMMEDIATE;
141}
142
143bool
144LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
145{
146   return ld &&
147      (ld->op == OP_VFETCH ||
148       (ld->op == OP_LOAD &&
149        (ld->src(0).getFile() == FILE_SHADER_INPUT ||
150         ld->src(0).getFile() == FILE_MEMORY_SHARED)));
151}
152
void
LoadPropagation::checkSwapSrc01(Instruction *insn)
{
   // Only reorder commutative ops, or SET/SLCT where we can compensate
   // by adjusting the condition code afterwards.
   if (!prog->getTarget()->getOpInfo(insn).commutative)
      if (insn->op != OP_SET && insn->op != OP_SLCT)
         return;
   if (insn->src(1).getFile() != FILE_GPR)
      return;

   Instruction *i0 = insn->getSrc(0)->getInsn();
   Instruction *i1 = insn->getSrc(1)->getInsn();

   // Move the more foldable operand into slot 1; priority order is
   // c-space load > 32-bit immediate > attrib/shared load.
   if (isCSpaceLoad(i0)) {
      if (!isCSpaceLoad(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isImmd32Load(i0)) {
      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
         insn->swapSources(0, 1);
      else
         return;
   } else
   if (isAttribOrSharedLoad(i1)) {
      if (!isAttribOrSharedLoad(i0))
         insn->swapSources(0, 1);
      else
         return;
   } else {
      return;
   }

   // Sources were swapped: fix up the comparison's condition code.
   if (insn->op == OP_SET)
      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
   else
   if (insn->op == OP_SLCT)
      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
}
192
bool
LoadPropagation::visit(BasicBlock *bb)
{
   const Target *targ = prog->getTarget();
   Instruction *next;

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->srcExists(1))
         checkSwapSrc01(i);

      for (int s = 0; i->srcExists(s); ++s) {
         Instruction *ld = i->getSrc(s)->getInsn();

         // Only fold plain loads / immediate MOVs the target can encode.
         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
            continue;
         if (!targ->insnCanLoad(i, s, ld))
            continue;

         // propagate !
         i->setSrc(s, ld->getSrc(0));
         if (ld->src(0).isIndirect(0))
            i->setIndirect(s, 0, ld->getIndirect(0, 0));

         // Delete the load if this was its last use.
         if (ld->getDef(0)->refCount() == 0)
            delete_Instruction(prog, ld);
      }
   }
   return true;
}
224
225// =============================================================================
226
// Evaluate constant expressions.
class ConstantFolding : public Pass
{
public:
   // Run folding repeatedly until no more folds occur (bounded iteration).
   bool foldAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   // Both sources immediate: compute the result outright.
   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
   // Exactly one source (index s) immediate: simplify / strength-reduce.
   void opnd(Instruction *, ImmediateValue&, int s);

   void unary(Instruction *, const ImmediateValue&);

   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);

   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
   CmpInstruction *findOriginForTestWithZero(Value *);

   unsigned int foldCount; // number of folds performed in the last run()

   BuildUtil bld;
};
250
251// TODO: remember generated immediates and only revisit these
252bool
253ConstantFolding::foldAll(Program *prog)
254{
255   unsigned int iterCount = 0;
256   do {
257      foldCount = 0;
258      if (!run(prog))
259         return false;
260   } while (foldCount && ++iterCount < 2);
261   return true;
262}
263
264bool
265ConstantFolding::visit(BasicBlock *bb)
266{
267   Instruction *i, *next;
268
269   for (i = bb->getEntry(); i; i = next) {
270      next = i->next;
271      if (i->op == OP_MOV || i->op == OP_CALL)
272         continue;
273
274      ImmediateValue src0, src1;
275
276      if (i->srcExists(1) &&
277          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
278         expr(i, src0, src1);
279      else
280      if (i->srcExists(0) && i->src(0).getImmediate(src0))
281         opnd(i, src0, 0);
282      else
283      if (i->srcExists(1) && i->src(1).getImmediate(src1))
284         opnd(i, src1, 1);
285   }
286   return true;
287}
288
289CmpInstruction *
290ConstantFolding::findOriginForTestWithZero(Value *value)
291{
292   if (!value)
293      return NULL;
294   Instruction *insn = value->getInsn();
295
296   while (insn && insn->op != OP_SET) {
297      Instruction *next = NULL;
298      switch (insn->op) {
299      case OP_NEG:
300      case OP_ABS:
301      case OP_CVT:
302         next = insn->getSrc(0)->getInsn();
303         if (insn->sType != next->dType)
304            return NULL;
305         break;
306      case OP_MOV:
307         next = insn->getSrc(0)->getInsn();
308         break;
309      default:
310         return NULL;
311      }
312      insn = next;
313   }
314   return insn ? insn->asCmp() : NULL;
315}
316
// Apply this modifier (ABS / NEG / SAT / NOT, as permitted by the type)
// to an immediate value in place.
void
Modifier::applyTo(ImmediateValue& imm) const
{
   switch (imm.reg.type) {
   case TYPE_F32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f32 = -imm.reg.data.f32;
      if (bits & NV50_IR_MOD_SAT) {
         // saturate: clamp into [0.0, 1.0]
         if (imm.reg.data.f32 < 0.0f)
            imm.reg.data.f32 = 0.0f;
         else
         if (imm.reg.data.f32 > 1.0f)
            imm.reg.data.f32 = 1.0f;
      }
      assert(!(bits & NV50_IR_MOD_NOT)); // NOT is integer-only
      break;

   case TYPE_S8: // NOTE: will be extended
   case TYPE_S16:
   case TYPE_S32:
   case TYPE_U8: // NOTE: treated as signed
   case TYPE_U16:
   case TYPE_U32:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
            imm.reg.data.s32 : -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.s32 = -imm.reg.data.s32;
      if (bits & NV50_IR_MOD_NOT)
         imm.reg.data.s32 = ~imm.reg.data.s32;
      break;

   case TYPE_F64:
      if (bits & NV50_IR_MOD_ABS)
         imm.reg.data.f64 = fabs(imm.reg.data.f64);
      if (bits & NV50_IR_MOD_NEG)
         imm.reg.data.f64 = -imm.reg.data.f64;
      if (bits & NV50_IR_MOD_SAT) {
         if (imm.reg.data.f64 < 0.0)
            imm.reg.data.f64 = 0.0;
         else
         if (imm.reg.data.f64 > 1.0)
            imm.reg.data.f64 = 1.0;
      }
      assert(!(bits & NV50_IR_MOD_NOT));
      break;

   default:
      assert(!"invalid/unhandled type");
      imm.reg.data.u64 = 0;
      break;
   }
}
372
373operation
374Modifier::getOp() const
375{
376   switch (bits) {
377   case NV50_IR_MOD_ABS: return OP_ABS;
378   case NV50_IR_MOD_NEG: return OP_NEG;
379   case NV50_IR_MOD_SAT: return OP_SAT;
380   case NV50_IR_MOD_NOT: return OP_NOT;
381   case 0:
382      return OP_MOV;
383   default:
384      return OP_CVT;
385   }
386}
387
// Evaluate an instruction whose two sources are both immediates.  The
// computed constant replaces src(0) and the op becomes a MOV — except for
// MAD/FMA, which degrade to an ADD of the third source (see bottom).
void
ConstantFolding::expr(Instruction *i,
                      ImmediateValue &imm0, ImmediateValue &imm1)
{
   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
   struct Storage res;

   memset(&res.data, 0, sizeof(res.data));

   switch (i->op) {
   case OP_MAD:
   case OP_FMA:
   case OP_MUL:
      // dnz: non-finite f32 multiplicands are flushed to zero
      if (i->dnz && i->dType == TYPE_F32) {
         if (!isfinite(a->data.f32))
            a->data.f32 = 0.0f;
         if (!isfinite(b->data.f32))
            b->data.f32 = 0.0f;
      }
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_DIV:
      if (b->data.u32 == 0) // never fold a division by zero
         break;
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_ADD:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
      case TYPE_S32:
      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_POW:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
      default:
         return;
      }
      break;
   case OP_MAX:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_MIN:
      switch (i->dType) {
      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
      default:
         return;
      }
      break;
   case OP_AND:
      res.data.u64 = a->data.u64 & b->data.u64;
      break;
   case OP_OR:
      res.data.u64 = a->data.u64 | b->data.u64;
      break;
   case OP_XOR:
      res.data.u64 = a->data.u64 ^ b->data.u64;
      break;
   case OP_SHL:
      res.data.u32 = a->data.u32 << b->data.u32;
      break;
   case OP_SHR:
      switch (i->dType) {
      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
      default:
         return;
      }
      break;
   case OP_SLCT:
      // Only foldable when both selectable values are identical.
      if (a->data.u32 != b->data.u32)
         return;
      res.data.u32 = a->data.u32;
      break;
   default:
      return;
   }
   ++foldCount;

   // Install the computed constant as src(0) and drop src(1).
   i->src(0).mod = Modifier(0);
   i->src(1).mod = Modifier(0);

   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
   i->setSrc(1, NULL);

   i->getSrc(0)->reg.data = res.data;

   if (i->op == OP_MAD || i->op == OP_FMA) {
      // MAD/FMA: the product is constant; rewrite as src2 + constant,
      // then try folding again if the remaining addend is constant too.
      i->op = OP_ADD;

      i->setSrc(1, i->getSrc(0));
      i->src(1).mod = i->src(2).mod;
      i->setSrc(0, i->getSrc(2));
      i->setSrc(2, NULL);

      ImmediateValue src0;
      if (i->src(0).getImmediate(src0))
         expr(i, src0, *i->getSrc(1)->asImm());
   } else {
      i->op = OP_MOV;
   }
}
519
// Evaluate a single-source f32 operation with an immediate operand and
// replace it by a MOV of the result.
void
ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
{
   Storage res;

   if (i->dType != TYPE_F32)
      return;
   switch (i->op) {
   case OP_NEG: res.data.f32 = -imm.reg.data.f32; break;
   case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break;
   case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break;
   case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break;
   case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break;
   case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break;
   case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break;
   case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break;
   case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break;
   case OP_PRESIN:
   case OP_PREEX2:
      // these should be handled in subsequent OP_SIN/COS/EX2
      res.data.f32 = imm.reg.data.f32;
      break;
   default:
      return;
   }
   i->op = OP_MOV;
   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
   i->src(0).mod = Modifier(0);
}
549
// Collapse a chain of two f32 MULs where one factor (imm2, at source
// index s of mul2) is an immediate: either combine the two immediates
// into one, or fold the immediate into a hardware post-multiply factor.
void
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1; // index of the non-immediate source of mul2
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0; // post-multiply exponent, filled in by the target query
   float f = imm2.reg.data.f32;
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      // mul2's variable operand is only used here; look at its producer.
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm   -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
         }
         mul1->saturate = mul2->saturate;
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c   -> d = mul_x_imm a, c
      // NOTE: mul1/mul2 are re-purposed here; insn is the single user.
      int s2, t2;
      insn = mul2->getDef(0)->uses.front()->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}
613
// Simplify / strength-reduce an instruction that has exactly one
// immediate source, at index s; t denotes the other source.
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op; // remembered to detect whether we folded

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      // x * 0 -> 0
      if (imm0.isInteger(0)) {
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      // x * (+/-)1 -> (NEG) x; becomes MOV/NEG/ABS/CVT of the modifier
      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      // x * (+/-)2 -> x + (+/-)x
      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      // integer x * 2^n -> x << n
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      }
      break;
   case OP_ADD:
      // x + 0 -> x (or the op implied by x's source modifier)
      if (imm0.isInteger(0)) {
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      // Only integer division by a constant divisor is handled here.
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         // unsigned x / 2^n -> x >> n
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
         // Unsigned division by an arbitrary constant, replaced by a
         // multiply-high with a precomputed reciprocal plus shift/add
         // fixups (magic-number division, cf. Granlund & Montgomery).
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s; // NOTE: this 's' shadows the parameter
         uint32_t l = util_logbase2(d); // l = ceil(log2(d)) after fixup
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         // signed x / -1 -> -x
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
         // Signed division by constant: multiply-high with a signed magic
         // number, arithmetic shift, then a correction for negative
         // dividends; negate the result for a negative divisor.
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         // tA = (src0 < 0), subtracted below to round towards zero
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);

         delete_Instruction(prog, i);
      }
      break;

   case OP_MOD:
      // unsigned x % 2^n -> x & (2^n - 1)
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      // Fold a comparison of a SET result against zero into a single SET
      // with an adjusted condition code.
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (i->src(t).mod != Modifier(0))
         return;
      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      // NOTE(review): relies on SET producing 0 / non-zero "true"
      // (TGSI true = -1, see comment on findOriginForTestWithZero):
      // e.g. "result < 0" can never hold (CC_FL), ">= 0" always (CC_TR).
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break;
      case CC_GE: cc = CC_TR; break;
      case CC_EQ: cc = inverseCondCode(cc); break;
      case CC_LE: cc = inverseCondCode(cc); break;
      case CC_GT: break;
      case CC_NE: break;
      default:
         return;
      }
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      i->sType = si->sType;
   }
      break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si || si->op != OP_SHL)
         break;
      ImmediateValue imm1;
      if (si->src(1).getImmediate(imm1)) {
         bld.setPosition(i, false);
         i->setSrc(0, si->getSrc(0));
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
      }
   }
      break;

   case OP_ABS:
   case OP_NEG:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   default:
      return;
   }
   // Count a fold whenever the opcode changed.
   if (i->op != op)
      foldCount++;
}
826
827// =============================================================================
828
// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   // Folds each source's defining ABS/NEG/NOT into a source modifier,
   // and merges standalone SATs into their producing instruction.
   virtual bool visit(BasicBlock *);
};
835
bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) { // intentionally disabled
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      // Try to fold each source's defining modifier-op into a src modifier.
      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         // Skip predicated producers and heavily used values.
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            // signed producer into unsigned consumer: only allow
            // ABS/NEG results feeding ADD/MUL
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         // Modifier(op) is 0 unless mi is an ABS/NEG/NOT-style op.
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod;

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      // Merge a standalone SAT into its single-use source instruction.
      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}
906
907// =============================================================================
908
// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ? a : b
// RCP(RCP(a)) -> a
// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
class AlgebraicOpt : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   void handleABS(Instruction *);   // ABS(SUB) -> SAD
   bool handleADD(Instruction *);   // returns true if the ADD was fused
   bool tryADDToMADOrSAD(Instruction *, operation toOp);
   void handleMINMAX(Instruction *);
   void handleRCP(Instruction *);
   void handleSLCT(Instruction *);
   void handleLOGOP(Instruction *);
   void handleCVT(Instruction *);

   BuildUtil bld;
};
930
// Recognize ABS(SUB(a, b)) — or ABS(ADD(a, NEG(b))) — and replace it by
// SAD(a, b, 0) where the target supports SAD for this type.
void
AlgebraicOpt::handleABS(Instruction *abs)
{
   Instruction *sub = abs->getSrc(0)->getInsn();
   DataType ty;
   if (!sub ||
       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
      return;
   // expect not to have mods yet, if we do, bail
   if (sub->src(0).mod || sub->src(1).mod)
      return;
   // hidden conversion ?
   ty = intTypeToSigned(sub->dType);
   if (abs->dType != abs->sType || ty != abs->sType)
      return;

   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
         return;

   Value *src0 = sub->getSrc(0);
   Value *src1 = sub->getSrc(1);

   // For ADD, one operand must be the result of a type-matching NEG.
   if (sub->op == OP_ADD) {
      Instruction *neg = sub->getSrc(1)->getInsn();
      if (neg && neg->op != OP_NEG) {
         neg = sub->getSrc(0)->getInsn();
         src0 = sub->getSrc(1);
      }
      if (!neg || neg->op != OP_NEG ||
          neg->dType != neg->sType || neg->sType != ty)
         return;
      src1 = neg->getSrc(0);
   }

   // found ABS(SUB))
   abs->moveSources(1, 2); // move sources >=1 up by 2
   abs->op = OP_SAD;
   abs->setType(sub->dType);
   abs->setSrc(0, src0);
   abs->setSrc(1, src1);
   bld.setPosition(abs, false);
   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
}
976
977bool
978AlgebraicOpt::handleADD(Instruction *add)
979{
980   Value *src0 = add->getSrc(0);
981   Value *src1 = add->getSrc(1);
982
983   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
984      return false;
985
986   bool changed = false;
987   if (!changed && prog->getTarget()->isOpSupported(OP_MAD, add->dType))
988      changed = tryADDToMADOrSAD(add, OP_MAD);
989   if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType))
990      changed = tryADDToMADOrSAD(add, OP_SAD);
991   return changed;
992}
993
// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
// ADD(MUL(a,b), c) -> MAD(a,b,c)
bool
AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
{
   Value *src0 = add->getSrc(0);
   Value *src1 = add->getSrc(1);
   Value *src; // the single-use MUL/SAD result being merged
   int s;
   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
   // MAD tolerates NEG modifiers only; SAD tolerates none at all.
   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
   Modifier mod[4];

   // Find which ADD operand is the single-use product.
   if (src0->refCount() == 1 &&
       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
      s = 0;
   else
   if (src1->refCount() == 1 &&
       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
      s = 1;
   else
      return false;

   // Don't merge across basic block boundaries.
   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
      return false;

   src = add->getSrc(s);

   if (src->getInsn()->postFactor)
      return false;
   if (toOp == OP_SAD) {
      // SAD merge requires the existing addend to be the immediate 0.
      ImmediateValue imm;
      if (!src->getInsn()->src(2).getImmediate(imm))
         return false;
      if (!imm.isInteger(0))
         return false;
   }

   mod[0] = add->src(0).mod;
   mod[1] = add->src(1).mod;
   mod[2] = src->getUniqueInsn()->src(0).mod;
   mod[3] = src->getUniqueInsn()->src(1).mod;

   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
      return false;

   add->op = toOp;
   add->subOp = src->getInsn()->subOp; // potentially mul-high

   add->setSrc(2, add->src(s ? 0 : 1)); // the other addend becomes src 2

   add->setSrc(0, src->getInsn()->getSrc(0));
   add->src(0).mod = mod[2] ^ mod[s];
   add->setSrc(1, src->getInsn()->getSrc(1));
   add->src(1).mod = mod[3];

   return true;
}
1053
1054void
1055AlgebraicOpt::handleMINMAX(Instruction *minmax)
1056{
1057   Value *src0 = minmax->getSrc(0);
1058   Value *src1 = minmax->getSrc(1);
1059
1060   if (src0 != src1 || src0->reg.file != FILE_GPR)
1061      return;
1062   if (minmax->src(0).mod == minmax->src(1).mod) {
1063      if (minmax->def(0).mayReplace(minmax->src(0))) {
1064         minmax->def(0).replace(minmax->src(0), false);
1065         minmax->bb->remove(minmax);
1066      } else {
1067         minmax->op = OP_CVT;
1068         minmax->setSrc(1, NULL);
1069      }
1070   } else {
1071      // TODO:
1072      // min(x, -x) = -abs(x)
1073      // min(x, -abs(x)) = -abs(x)
1074      // min(x, abs(x)) = x
1075      // max(x, -abs(x)) = x
1076      // max(x, abs(x)) = abs(x)
1077      // max(x, -x) = abs(x)
1078   }
1079}
1080
1081void
1082AlgebraicOpt::handleRCP(Instruction *rcp)
1083{
1084   Instruction *si = rcp->getSrc(0)->getUniqueInsn();
1085
1086   if (si && si->op == OP_RCP) {
1087      Modifier mod = rcp->src(0).mod * si->src(0).mod;
1088      rcp->op = mod.getOp();
1089      rcp->setSrc(0, si->getSrc(0));
1090   }
1091}
1092
1093void
1094AlgebraicOpt::handleSLCT(Instruction *slct)
1095{
1096   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
1097      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
1098         slct->setSrc(0, slct->getSrc(1));
1099   } else
1100   if (slct->getSrc(0) != slct->getSrc(1)) {
1101      return;
1102   }
1103   slct->op = OP_MOV;
1104   slct->setSrc(1, NULL);
1105   slct->setSrc(2, NULL);
1106}
1107
// Simplify logic ops:
//  AND(a, a) / OR(a, a) -> a
//  AND/OR/XOR(SET, SET) -> single SET with a combining subop
void
AlgebraicOpt::handleLOGOP(Instruction *logop)
{
   Value *src0 = logop->getSrc(0);
   Value *src1 = logop->getSrc(1);

   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
      return;

   if (src0 == src1) {
      // AND(a, a) = OR(a, a) = a; XOR is excluded (XOR(a, a) != a)
      if ((logop->op == OP_AND || logop->op == OP_OR) &&
          logop->def(0).mayReplace(logop->src(0))) {
         logop->def(0).replace(logop->src(0), false);
         delete_Instruction(prog, logop);
      }
   } else {
      // try AND(SET, SET) -> SET_AND(SET)
      Instruction *set0 = src0->getInsn();
      Instruction *set1 = src1->getInsn();

      if (!set0 || set0->fixed || !set1 || set1->fixed)
         return;
      // ensure set1 is the plain SET; set0 may already be a SET_<LOGOP>
      if (set1->op != OP_SET) {
         Instruction *xchg = set0;
         set0 = set1;
         set1 = xchg;
         if (set1->op != OP_SET)
            return;
      }
      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
         return;
      if (set0->op != OP_SET &&
          set0->op != OP_SET_AND &&
          set0->op != OP_SET_OR &&
          set0->op != OP_SET_XOR)
         return;
      // if both SET results have other users, combining saves nothing
      if (set0->getDef(0)->refCount() > 1 &&
          set1->getDef(0)->refCount() > 1)
         return;
      // don't combine predicated SETs
      if (set0->getPredicate() || set1->getPredicate())
         return;
      // check that they don't source each other
      for (int s = 0; s < 2; ++s)
         if (set0->getSrc(s) == set1->getDef(0) ||
             set1->getSrc(s) == set0->getDef(0))
            return;

      // work on clones so the originals survive if still referenced
      set0 = cloneForward(func, set0);
      set1 = cloneShallow(func, set1);
      logop->bb->insertAfter(logop, set1);
      logop->bb->insertAfter(logop, set0);

      // set0 now produces a predicate that set1 combines with its own result
      set0->dType = TYPE_U8;
      set0->getDef(0)->reg.file = FILE_PREDICATE;
      set0->getDef(0)->reg.size = 1;
      set1->setSrc(2, set0->getDef(0));
      set1->op = redOp;
      set1->setDef(0, logop->getDef(0));
      delete_Instruction(prog, logop);
   }
}
1171
// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
// nv50:
//  F2I(NEG(I2F(ABS(SET))))
void
AlgebraicOpt::handleCVT(Instruction *cvt)
{
   // only exact f32 -> s32 conversions without source modifiers qualify
   if (cvt->sType != TYPE_F32 ||
       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
      return;
   Instruction *insn = cvt->getSrc(0)->getInsn();
   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
      return;
   if (insn->src(0).mod != Modifier(0))
      return;
   insn = insn->getSrc(0)->getInsn();

   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
   if (insn && insn->op == OP_CVT &&
       insn->dType == TYPE_F32 &&
       insn->sType == TYPE_S32) {
      // nv50 path: peel I2F(ABS(...)) to reach the original boolean SET
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
          insn->src(0).mod)
         return;
      insn = insn->getSrc(0)->getInsn();
      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
         return;
   } else
   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
      return;
   }

   // replace the whole conversion chain by one SET producing 0 / -1
   Instruction *bset = cloneShallow(func, insn);
   bset->dType = TYPE_U32;
   bset->setDef(0, cvt->getDef(0));
   cvt->bb->insertAfter(cvt, bset);
   delete_Instruction(prog, cvt);
}
1210
1211bool
1212AlgebraicOpt::visit(BasicBlock *bb)
1213{
1214   Instruction *next;
1215   for (Instruction *i = bb->getEntry(); i; i = next) {
1216      next = i->next;
1217      switch (i->op) {
1218      case OP_ABS:
1219         handleABS(i);
1220         break;
1221      case OP_ADD:
1222         handleADD(i);
1223         break;
1224      case OP_RCP:
1225         handleRCP(i);
1226         break;
1227      case OP_MIN:
1228      case OP_MAX:
1229         handleMINMAX(i);
1230         break;
1231      case OP_SLCT:
1232         handleSLCT(i);
1233         break;
1234      case OP_AND:
1235      case OP_OR:
1236      case OP_XOR:
1237         handleLOGOP(i);
1238         break;
1239      case OP_CVT:
1240         handleCVT(i);
1241         break;
1242      default:
1243         break;
1244      }
1245   }
1246
1247   return true;
1248}
1249
1250// =============================================================================
1251
1252static inline void
1253updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
1254{
1255   if (offset != ldst->getSrc(0)->reg.data.offset) {
1256      if (ldst->getSrc(0)->refCount() > 1)
1257         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
1258      ldst->getSrc(0)->reg.data.offset = offset;
1259   }
1260}
1261
// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // Bookkeeping for one recorded load/store: its address (base symbol,
   // indirect address sources, constant offset) and the byte range covered.
   class Record
   {
   public:
      Record *next;
      Instruction *insn;    // the recorded load/store instruction
      const Value *rel[2];  // indirect address sources, if any
      const Value *base;    // base symbol of the access
      int32_t offset;       // constant byte offset of the access
      int8_t fileIndex;
      uint8_t size;         // access size in bytes
      bool locked;          // store may no longer be eliminated or merged
      Record *prev;

      // does the memory range of @ldst intersect this record's range ?
      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // per-DataFile lists of the loads/stores seen in the current BB
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};
1318
1319MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
1320{
1321   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
1322      loads[i] = NULL;
1323      stores[i] = NULL;
1324   }
1325   prevRecord = NULL;
1326}
1327
1328void
1329MemoryOpt::reset()
1330{
1331   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
1332      Record *it, *next;
1333      for (it = loads[i]; it; it = next) {
1334         next = it->next;
1335         recordPool.release(it);
1336      }
1337      loads[i] = NULL;
1338      for (it = stores[i]; it; it = next) {
1339         next = it->next;
1340         recordPool.release(it);
1341      }
1342      stores[i] = NULL;
1343   }
1344}
1345
// Merge adjacent load @ld into the recorded load of @rec, widening the
// recorded access and transferring @ld's definitions to it.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // count the existing definitions of the recorded load (j after the loop)
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      // @ld covers the lower addresses: shift the recorded defs upwards
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      // lower the record's base offset to that of @ld (clone the address
      // symbol if other instructions still reference it)
      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   // widen the recorded access to the combined size
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}
1398
// Merge adjacent store @st with the recorded store of @rec, making @st
// the single combined store covering both ranges.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned 8 byte stores
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // record covers the lower addresses, its values go first:
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      // @st covers the lower addresses: append the record's values
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   // @st replaces the previously recorded store
   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}
1454
1455void
1456MemoryOpt::Record::set(const Instruction *ldst)
1457{
1458   const Symbol *mem = ldst->getSrc(0)->asSym();
1459   fileIndex = mem->reg.fileIndex;
1460   rel[0] = ldst->getIndirect(0, 0);
1461   rel[1] = ldst->getIndirect(0, 1);
1462   offset = mem->reg.data.offset;
1463   base = mem->getBase();
1464   size = typeSizeof(ldst->sType);
1465}
1466
1467void
1468MemoryOpt::Record::link(Record **list)
1469{
1470   next = *list;
1471   if (next)
1472      next->prev = this;
1473   prev = NULL;
1474   *list = this;
1475}
1476
1477void
1478MemoryOpt::Record::unlink(Record **list)
1479{
1480   if (next)
1481      next->prev = prev;
1482   if (prev)
1483      prev->next = next;
1484   else
1485      *list = next;
1486}
1487
1488MemoryOpt::Record **
1489MemoryOpt::getList(const Instruction *insn)
1490{
1491   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
1492      return &loads[insn->src(0).getFile()];
1493   return &stores[insn->src(0).getFile()];
1494}
1495
1496void
1497MemoryOpt::addRecord(Instruction *i)
1498{
1499   Record **list = getList(i);
1500   Record *it = reinterpret_cast<Record *>(recordPool.allocate());
1501
1502   it->link(list);
1503   it->set(i);
1504   it->insn = i;
1505   it->locked = false;
1506}
1507
// Find a recorded access that @insn overlaps with (returned immediately,
// isAdj = false) or that is directly adjacent and mergeable (returned after
// the scan, isAdj = true).
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      // locked stores may not be merged/eliminated, but can still serve
      // as the data source for a load
      if (it->locked && insn->op != OP_LOAD)
         continue;
      // must be in the same 16 byte region, same file index and same
      // indirect address sources
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         // record starts below @insn: overlapping or directly adjacent ?
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            // only merge when the combined access stays 8 byte aligned
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         // record starts at or above @insn's offset
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}
1545
// Forward the stored values of @rec to load @ld: replace the load's
// results with the matching sources of the recorded store.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // advance to the store source aligned with the load's offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      // only GPR values can stand in for a load result
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}
1569
// Replace load @ldE by the results of the earlier recorded load @rec
// that covers the same location.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   // advance to the recorded definition aligned with @ldE's offset
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}
1593
// Merge the recorded store into @st: where the ranges overlap, @st's newer
// values win; non-overlapping parts of the recorded store are carried over.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      // recorded store starts lower: rebuild @st's source list by
      // concatenating leading @ri values, all @st values, trailing @ri values
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= Elements(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      // @st starts at/below the record but the record extends further:
      // append the record's trailing values to @st
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   // @st replaces the recorded instruction
   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}
1648
1649bool
1650MemoryOpt::Record::overlaps(const Instruction *ldst) const
1651{
1652   Record that;
1653   that.set(ldst);
1654
1655   if (this->fileIndex != that.fileIndex)
1656      return false;
1657
1658   if (this->rel[0] || that.rel[0])
1659      return this->base == that.base;
1660   return
1661      (this->offset < that.offset + that.size) &&
1662      (this->offset + this->size > that.offset);
1663}
1664
1665// We must not eliminate stores that affect the result of @ld if
1666// we find later stores to the same location, and we may no longer
1667// merge them with later stores.
1668// The stored value can, however, still be used to determine the value
1669// returned by future loads.
1670void
1671MemoryOpt::lockStores(Instruction *const ld)
1672{
1673   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
1674      if (!r->locked && r->overlaps(ld))
1675         r->locked = true;
1676}
1677
1678// Prior loads from the location of @st are no longer valid.
1679// Stores to the location of @st may no longer be used to derive
1680// the value at it nor be coalesced into later stores.
1681void
1682MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
1683{
1684   if (st)
1685      f = st->src(0).getFile();
1686
1687   for (Record *r = loads[f]; r; r = r->next)
1688      if (!st || r->overlaps(st))
1689         r->unlink(&loads[f]);
1690
1691   for (Record *r = stores[f]; r; r = r->next)
1692      if (!st || r->overlaps(st))
1693         r->unlink(&stores[f]);
1694}
1695
1696bool
1697MemoryOpt::visit(BasicBlock *bb)
1698{
1699   bool ret = runOpt(bb);
1700   // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
1701   // where 96 bit memory operations are forbidden.
1702   if (ret)
1703      ret = runOpt(bb);
1704   return ret;
1705}
1706
// Walk @bb once, forwarding stores to loads, eliminating redundant loads
// and combining adjacent loads/stores via the record lists.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;  // does @ldst survive and get recorded ?
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         isLoad = false;
      } else {
         // not a memory access: flush records across ops that may touch
         // memory, then move on
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         // stores feeding this load must not be eliminated later
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         // a surviving store invalidates overlapping prior records
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}
1784
1785// =============================================================================
1786
1787// Turn control flow into predicated instructions (after register allocation !).
1788// TODO:
1789// Could move this to before register allocation on NVC0 and also handle nested
1790// constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // predicate both sides of a simple conditional if they are short enough
   bool tryPredicateConditional(BasicBlock *);
   // apply (cc, pred) to all instructions of @bb and remove its exit flow op
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   // if we conditionally skip over or to a branch, replace ours by it
   void tryPropagateBranch(BasicBlock *);
   // does @pred depend only on immediates / constant memory ?
   inline bool isConstantCondition(Value *pred);
   // may @insn be predicated on @pred ?
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);
};
1803
// Check whether the SET computing @pred sources only immediates or
// (directly addressed) constant memory.
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         // look through a simple MOV/LOAD to find the original file
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0
         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
            file = FILE_IMMEDIATE;
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}
1832
// Delete a removable BRA or JOIN, and, if that drops the last reference
// to its predicate, the (dead) instruction computing the predicate too.
void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      // NOTE(review): pSet is dereferenced without a NULL check -- assumes
      // getUniqueInsn() of a just-referenced predicate never returns NULL;
      // confirm
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}
1860
1861void
1862FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
1863{
1864   for (Instruction *i = bb->getEntry(); i; i = i->next) {
1865      if (i->isNop())
1866         continue;
1867      assert(!i->getPredicate());
1868      i->setPredicate(cc, pred);
1869   }
1870   removeFlow(bb->getExit());
1871}
1872
1873bool
1874FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
1875{
1876   if (insn->isPseudo())
1877      return true;
1878   // TODO: calls where we don't know which registers are modified
1879
1880   if (!prog->getTarget()->mayPredicate(insn, pred))
1881      return false;
1882   for (int d = 0; insn->defExists(d); ++d)
1883      if (insn->getDef(d)->equals(pred))
1884         return false;
1885   return true;
1886}
1887
// If we conditionally skip over or to a branch instruction, replace it.
// NOTE: We do not update the CFG anymore here !
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   BasicBlock *bf = NULL;
   unsigned int i;

   if (bb->cfg.outgoingCount() != 2)
      return;
   if (!bb->getExit() || bb->getExit()->op != OP_BRA)
      return;
   Graph::EdgeIterator ei = bb->cfg.outgoing();

   // look for a successor block consisting of a single instruction
   for (i = 0; !ei.end(); ++i, ei.next()) {
      bf = BasicBlock::get(ei.getNode());
      if (bf->getInsnCount() == 1)
         break;
   }
   if (ei.end() || !bf->getExit())
      return;
   FlowInstruction *bra = bb->getExit()->asFlow();
   FlowInstruction *rep = bf->getExit()->asFlow();

   if (rep->getPredicate())
      return;
   if (rep->op != OP_BRA &&
       rep->op != OP_JOIN &&
       rep->op != OP_EXIT)
      return;

   // make our branch perform the successor's flow op directly
   bra->op = rep->op;
   bra->target.bb = rep->target.bb;
   if (i) // 2nd out block means branch not taken
      bra->cc = inverseCondCode(bra->cc);
   bf->remove(rep);
}
1925
bool
FlatteningPass::visit(BasicBlock *bb)
{
   // first choice: turn the whole conditional into predicated instructions
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   Instruction *insn = bb->getExit();
   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
      insn = insn->prev;
      // the join flag is not attached to flow ops, TEXBAR, texture ops,
      // interpolation ops, or memory accesses wider than 4 bytes
      if (insn && !insn->getPredicate() &&
          !insn->asFlow() &&
          insn->op != OP_TEXBAR &&
          !isTextureOp(insn->op) && // probably just nve4
          insn->op != OP_LINTERP && // probably just nve4
          insn->op != OP_PINTERP && // probably just nve4
          ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
           typeSizeof(insn->dType) <= 4) &&
          !insn->isNop()) {
         insn->join = 1;
         bb->remove(bb->getExit());
         return true;
      }
   }

   tryPropagateBranch(bb);

   return true;
}
1955
// Convert a simple if/else construct into predicated instructions and
// remove the branch/join flow ops, if both sides are short enough.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   // mask bit 0/1 selects which of the two successors may be predicated
   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // constant conditions make a real branch cheap (presumably uniform
   // across the warp), so only flatten very short blocks then
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   // first successor runs under the branch condition, second under its inverse
   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}
2016
2017// =============================================================================
2018
// Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   // try to replace *ptr by an equivalent earlier instruction
   inline bool tryReplace(Instruction **, Instruction *);

   // previously seen instructions, bucketed by opcode
   DLList ops[OP_LAST + 1];
};
2029
// Cross-basic-block common subexpression elimination pass.
class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};
2035
// Compare everything about two instructions except their operands, i.e.
// whether they would perform the same operation given equal sources.
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      // compare the complete texture argument state
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      // flow instructions are never considered action-equal
      return false;
   } else {
      if (this->atomic != that->atomic ||
          this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   // remaining flags that affect the generated operation
   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache)
      return false;

   return true;
}
2078
// Full CSE equivalence test: true if 'that' computes exactly the same
// values as this instruction, i.e. same action, same predicate source,
// matching def list and matching src list (including source modifiers).
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   // Defs must match pairwise; 'false' here presumably relaxes the value
   // comparison for results (compared by kind, not identity) — TODO confirm
   // against Value::equals.
   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   // Both instructions must have the same number of defs.
   if (that->defExists(d))
      return false;

   // Sources must match pairwise, including their modifiers (neg/abs/...).
   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   // Both instructions must have the same number of sources.
   if (that->srcExists(s))
      return false;

   // Loads are only safe to merge from read-only address spaces; anything
   // else may have been written between the two loads.
   if (op == OP_LOAD || op == OP_VFETCH) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      default:
         return false;
      }
   }

   return true;
}
2125
// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   // Walk the phi nodes at the head of this block. For each phi whose
   // incoming values are all produced by equivalent instructions that are
   // only used by this phi, hoist one of the producers into this block and
   // delete the phi.
   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      // Each incoming value must be used solely by the phi, otherwise the
      // original producers must stay where they are.
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      // Check that every other incoming value has a single use and is
      // produced by an instruction equivalent to ik.
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      // Loop ran to completion -> all sources are equivalent; move ik here.
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         // Insert after a leading OP_JOIN if present, otherwise at the head.
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         // The hoisted instruction now defines the phi's result directly.
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}
2163
2164bool
2165LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
2166{
2167   Instruction *old = *ptr;
2168
2169   // TODO: maybe relax this later (causes trouble with OP_UNION)
2170   if (i->isPredicated())
2171      return false;
2172
2173   if (!old->isResultEqual(i))
2174      return false;
2175
2176   for (int d = 0; old->defExists(d); ++d)
2177      old->def(d).replace(i->getDef(d), false);
2178   delete_Instruction(prog, old);
2179   *ptr = NULL;
2180   return true;
2181}
2182
// Run local CSE over one basic block, repeating until a full sweep makes
// no replacement (a replacement can expose further redundancies).
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         // Fixed instructions may not be removed, but can still serve as
         // replacement sources for later equivalent instructions.
         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // Pick the lvalue source with the fewest uses: any instruction
         // equal to ir must also read it, so its use list is the cheapest
         // set of candidates to scan.
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src || ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // Only consider readers of src that precede ir in this block.
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            // No lvalue sources (e.g. only immediates/inputs): fall back to
            // scanning all prior instructions with the same opcode.
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         // tryReplace NULLs ir when it deleted it.
         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      // The per-opcode candidate lists are only valid within one sweep.
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}
2243
2244// =============================================================================
2245
2246// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   // Repeatedly run the pass over the whole program until a full run
   // deletes nothing; returns false if a run fails.
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   // Number of instructions deleted during the current run.
   unsigned int deadCount;
};
2259
2260bool
2261DeadCodeElim::buryAll(Program *prog)
2262{
2263   do {
2264      deadCount = 0;
2265      if (!this->run(prog, false, false))
2266         return false;
2267   } while (deadCount);
2268
2269   return true;
2270}
2271
2272bool
2273DeadCodeElim::visit(BasicBlock *bb)
2274{
2275   Instruction *next;
2276
2277   for (Instruction *i = bb->getFirst(); i; i = next) {
2278      next = i->next;
2279      if (i->isDead()) {
2280         ++deadCount;
2281         delete_Instruction(prog, i);
2282      } else
2283      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
2284         checkSplitLoad(i);
2285      }
2286   }
2287   return true;
2288}
2289
// Shrink a multi-component load whose results are only partially used:
// drop dead leading/trailing components and, if the live components form
// two separate runs, split the load in two. Assumes at most 4 defs.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff;

   // Bit d cleared <=> def d is dead (unused and not assigned a register).
   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   // All components live -> nothing to do.
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
   // Collect the first run of live defs into def1[], skipping dead leading
   // components by advancing addr1 past them.
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         // NOTE(review): appears to stop growing the first load when its
         // start offset isn't 8-byte aligned — presumably to keep the
         // resulting access size/alignment legal; confirm against target.
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         // Still before the first live component: shift the load's base.
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         // Dead component after a live run terminates the first load.
         break;
      }
   }
   // Collect the remaining live defs into a second run; dead components
   // before it only advance addr2 (a third live run would violate the
   // assert below).
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      }
   }

   // Rewrite ld1 to fetch only the first run.
   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   // Second run present: clone the load for it and insert it after ld1.
   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}
2349
2350// =============================================================================
2351
// Helper for the optimize* entry points below: instantiate pass class 'n'
// and invoke its entry point 'f' on this program, but only when the
// requested optimization level is at least 'l'.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }
2360
// Run the peephole/CSE/folding passes while the program is in SSA form.
// The ordering is deliberate (see the inline note on ModifierFolding);
// returns false as soon as any pass fails.
bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}
2378
// Optimizations that are only valid after register allocation; currently
// just branch flattening (predication of short conditional blocks).
bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   return true;
}
2385
2386}
2387