1/*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
18 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
19 * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 * SOFTWARE.
21 */
22
23#include "nv50/codegen/nv50_ir.h"
24#include "nv50/codegen/nv50_ir_build_util.h"
25
26#include "nv50_ir_target_nv50.h"
27
28namespace nv50_ir {
29
30// nv50 doesn't support 32 bit integer multiplication
31//
32//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33// -------------------
34//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
36//       al*bl
37//    ah*bl 00
38//
39// fffe0001 + fffe0001
// Replace an integer multiply by a sequence built from half-width operands.
//
// @param bld  builder used to emit the replacement; it is repositioned
//             relative to @mul via setPosition(mul, true)
// @param mul  the multiply to expand; it is deleted on success
// @return     false (with @mul left untouched) if mul->sType is not one of
//             S32/U32/S64/U64, true once @mul has been replaced
//
// With subOp NV50_IR_SUBOP_MUL_HIGH the upper half of the product is written
// to mul's def 0, otherwise the lower half is.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;

   DataType fTy = mul->sType; // full type
   DataType hTy;              // type of one half of a source
   switch (fTy) {
   case TYPE_S32: hTy = TYPE_S16; break;
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   case TYPE_S64: hTy = TYPE_S32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   // Emitted instructions, indexed by role rather than by emission order.
   // Not every slot is filled on every path (5, 6 and 8 only for
   // highResult); only slots assigned below may be read.
   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *a[2], *b[2]; // halves of source 0 / source 1
   Value *c[2];        // carry flags, only used for highResult
   Value *t[4];        // full-size temporaries
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   // split sources into halves
   // (presumably index 0 is the low half, matching the al/ah scheme in the
   //  comment above this function -- TODO confirm against mkSplit)
   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));

   // t[1] = a[1] * b[0] + a[0] * b[1]   (sum of the cross products)
   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
   // t[3] = a[0] * b[0] + (t[1] << half width)
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);

   if (highResult) {
      Value *r[3];
      // NOTE(review): the shift is evaluated in int; if typeSizeof() returns
      // bytes (as the "* 8" suggests) the count is 32 for the 64 bit types --
      // confirm this path is only reached for 32 bit multiplies.
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 3; ++j)
         r[j] = bld->getSSA(fullSize);

      // r[0] = cross sum >> half width; r[1] = r[0] + imm is predicated on
      // carry c[0] below; r[2] is the UNION of both.
      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);
   } else {
      // low result: t[3] already holds it
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // The multiplies/MADs (slots 2..4, plus 5 for highResult) read half-size
   // sources, so their source type becomes the half type.
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}
106
// Two-bit operation codes that QUADOP() packs, one per field.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four QOP_* codes (given without the QOP_ prefix) into one byte, two
// bits each, with the first argument in the top two bits. The column header
// below names the four fields in argument order.
//             UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
116
// Pass run by TargetNV50::runLegalizePass() for CG_STAGE_POST_RA.
class NV50LegalizePostRA : public Pass
{
private:
   // Per-function setup: creates r63 and resolves the instructions queued in
   // prog->targetPriv.
   virtual bool visit(Function *);
   // Per-block work: drops no-ops, emulates PRERET on chipsets < 0xa0,
   // replaces zero immediates and splits 64 bit operations.
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *);
   void replaceZero(Instruction *);
   void split64BitOp(Instruction *);

   // GPR value with register id 63, substituted for zero immediates by
   // replaceZero(). Assigned in visit(Function *); there is no constructor
   // initializing it.
   LValue *r63;
};
129
130bool
131NV50LegalizePostRA::visit(Function *fn)
132{
133   Program *prog = fn->getProgram();
134
135   r63 = new_LValue(fn, FILE_GPR);
136   r63->reg.data.id = 63;
137
138   // this is actually per-program, but we can do it all on visiting main()
139   std::list<Instruction *> *outWrites =
140      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
141
142   if (outWrites) {
143      for (std::list<Instruction *>::iterator it = outWrites->begin();
144           it != outWrites->end(); ++it)
145         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
146      // instructions will be deleted on exit
147      outWrites->clear();
148   }
149
150   return true;
151}
152
153void
154NV50LegalizePostRA::replaceZero(Instruction *i)
155{
156   for (int s = 0; i->srcExists(s); ++s) {
157      ImmediateValue *imm = i->getSrc(s)->asImm();
158      if (imm && imm->reg.data.u64 == 0)
159         i->setSrc(s, r63);
160   }
161}
162
163void
164NV50LegalizePostRA::split64BitOp(Instruction *i)
165{
166   if (i->dType == TYPE_F64) {
167      if (i->op == OP_MAD)
168         i->op = OP_FMA;
169      if (i->op == OP_ADD || i->op == OP_MUL || i->op == OP_FMA ||
170          i->op == OP_CVT || i->op == OP_MIN || i->op == OP_MAX ||
171          i->op == OP_SET)
172         return;
173      i->dType = i->sType = TYPE_U32;
174
175      i->bb->insertAfter(i, cloneForward(func, i));
176   }
177}
178
179// Emulate PRERET: jump to the target and call to the origin from there
180//
181// WARNING: atm only works if BBs are affected by at most a single PRERET
182//
183// BB:0
184// preret BB:3
185// (...)
186// BB:3
187// (...)
188//             --->
189// BB:0
190// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
191// (...)
192// BB:3
193// bra BB:3 + n1 (skip the call)
194// call BB:0 + n2 (skip bra at beginning of BB:0)
195// (...)
// Rewrite one PRERET into the three-instruction emulation sketched in the
// comment above: @pre itself plus two new PRERET flow instructions in the
// target block, told apart by subOp NV50_IR_SUBOP_EMU_PRERET + 0/1/2.
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;        // block containing the PRERET
   BasicBlock *bbT = pre->target.bb; // block the PRERET points at

   // variant 0: move the original instruction to the head of its block
   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   // Inserting call first and skip second leaves skip in front of call.
   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}
217
218bool
219NV50LegalizePostRA::visit(BasicBlock *bb)
220{
221   Instruction *i, *next;
222
223   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
224   for (i = bb->getFirst(); i; i = next) {
225      next = i->next;
226      if (i->isNop()) {
227         bb->remove(i);
228      } else
229      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
230         handlePRERET(i->asFlow());
231      } else {
232         if (i->op != OP_MOV && i->op != OP_PFETCH &&
233             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
234            replaceZero(i);
235         if (typeSizeof(i->dType) == 8)
236            split64BitOp(i);
237      }
238   }
239   if (!bb->getEntry())
240      return true;
241
242   return true;
243}
244
// Pass run by TargetNV50::runLegalizePass() for CG_STAGE_SSA.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   // Dispatches EXPORT/DIV/MOD/MAD/MUL to the handlers below and legalizes
   // instructions that write FILE_ADDRESS.
   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *);
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   void handleMUL(Instruction *);
   void handleAddrDef(Instruction *);

   // true for SHL(GPR, 0)
   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // List stored in prog->targetPriv, or NULL when propagating writes to
   // outputs is disabled (see the constructor).
   std::list<Instruction *> *outWrites;
};
265
266NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
267{
268   bld.setProgram(prog);
269
270   if (prog->optLevel >= 2 &&
271       (prog->getType() == Program::TYPE_GEOMETRY ||
272        prog->getType() == Program::TYPE_VERTEX))
273      outWrites =
274         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
275   else
276      outWrites = NULL;
277}
278
279void
280NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
281{
282   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
283      return;
284
285   // check def instruction can store
286   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
287
288   // TODO: move exports (if beneficial) in common opt pass
289   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
290      return;
291   for (int s = 0; di->srcExists(s); ++s)
292      if (di->src(s).getFile() == FILE_IMMEDIATE)
293         return;
294
295   // We cannot set defs to non-lvalues before register allocation, so
296   // save & remove (to save registers) the exports and replace later.
297   outWrites->push_back(st);
298   st->bb->remove(st);
299}
300
301bool
302NV50LegalizeSSA::isARL(const Instruction *i) const
303{
304   ImmediateValue imm;
305
306   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
307      return false;
308   if (!i->src(1).getImmediate(imm))
309      return false;
310   return imm.isInteger(0);
311}
312
// Legalize an instruction whose def 0 lives in FILE_ADDRESS.
//
// SHL(GPR, imm) and ADD(ADDR, imm) are kept as they are. For anything else
// the address-file sources are replaced by GPR values and, unless the
// instruction has thereby become SHL(x, imm), its result is redirected to a
// fresh SSA value that a new SHL by 0 moves into the original def.
void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            // $a was produced by SHL(GPR, 0): use that GPR directly
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            // otherwise copy $a into a new SSA value first
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }
   // a SHL by an immediate is acceptable now that source 0 is no longer $a
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a
   // The new SHL takes over the original def; its source 0 is a fresh SSA
   // value which becomes the def of @i.
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}
351
// Expand an integer MUL or MAD wider than 2 bytes via expandIntegerMUL().
//
// Float multiplies and those with a source type of at most 2 bytes are left
// alone. A MAD is first split into a new MUL (placed via
// setPosition(add, false)) plus the original instruction turned into an ADD.
// A predicate on the original instruction is taken off before the expansion
// and afterwards put on whichever instruction then defines the original
// def 0.
void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   // Keep def/predicate/cc in locals: @mul may be deleted by the expansion.
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      Instruction *add = mul;
      bld.setPosition(add, false);
      // the product goes to a shallow clone of the MAD's result value
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      // add = product + former source 2; drop all sources from index 2 on
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      // the subOp (e.g. MUL_HIGH) belongs to the multiply
      mul->subOp = add->subOp;
      add->subOp = 0;
   }
   expandIntegerMUL(&bld, mul);
   if (pred)
      def->getInsn()->setPredicate(cc, pred);
}
380
381// Use f32 division: first compute an approximate result, use it to reduce
382// the dividend, which should then be representable as f32, divide the reduced
383// dividend, and add the quotients.
// Lower a U32/S32 DIV as described in the comment above; other source types
// are left untouched. The new instructions are emitted at the builder
// position set by setPosition(div, false) and @div itself is rewritten into
// the final SUB (unsigned) or UNION (signed).
void
NV50LegalizeSSA::handleDIV(Instruction *div)
{
   const DataType ty = div->sType;

   if (ty != TYPE_U32 && ty != TYPE_S32)
      return;

   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;

   bld.setPosition(div, false);

   // a/b: integer operands (absolute values if signed), af/bf: as f32
   Value *a, *af = bld.getSSA();
   Value *b, *bf = bld.getSSA();

   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));

   if (isSignedType(ty)) {
      // work on magnitudes; the sign is fixed at the end
      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
      a = bld.getSSA();
      b = bld.getSSA();
      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
   } else {
      a = div->getSrc(0);
      b = div->getSrc(1);
   }

   // bf = 1 / b, then an *integer* add of -2 on the float's bit pattern
   // (presumably to bias the reciprocal slightly downwards -- TODO confirm)
   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));

   // first approximation: q0 = trunc(af * bf)
   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;

   // get error of 1st result
   // aRf = a - q0 * b (integer), aR = the same as f32
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);

   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);

   // second approximation on the remainder: qR = trunc(aR * bf)
   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
      ->rnd = ROUND_Z;
   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients

   // correction: if modulus >= divisor, add 1
   // m = a - q * b, s = (m >= b); the result is computed as q - s, so this
   // presumably relies on SET producing -1 for "true" -- TODO confirm
   expandIntegerMUL(&bld,
      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
   if (!isSignedType(ty)) {
      // reuse the DIV itself as the final q - s
      div->op = OP_SUB;
      div->setSrc(0, q);
      div->setSrc(1, s);
   } else {
      t = q;
      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
      s = bld.getSSA();
      t = bld.getSSA();
      // fix the sign
      // flags come from src0 ^ src1: negate q under CC_S, copy it under
      // CC_NS, and let the DIV become the UNION of both
      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);

      div->op = OP_UNION;
      div->setSrc(0, s);
      div->setSrc(1, t);
   }
}
457
458void
459NV50LegalizeSSA::handleMOD(Instruction *mod)
460{
461   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
462      return;
463   bld.setPosition(mod, false);
464
465   Value *q = bld.getSSA();
466   Value *m = bld.getSSA();
467
468   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
469   handleDIV(q->getInsn());
470
471   bld.setPosition(mod, false);
472   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
473
474   mod->op = OP_SUB;
475   mod->setSrc(1, m);
476}
477
478bool
479NV50LegalizeSSA::visit(BasicBlock *bb)
480{
481   Instruction *insn, *next;
482   // skipping PHIs (don't pass them to handleAddrDef) !
483   for (insn = bb->getEntry(); insn; insn = next) {
484      next = insn->next;
485
486      switch (insn->op) {
487      case OP_EXPORT:
488         if (outWrites)
489            propagateWriteToOutput(insn);
490         break;
491      case OP_DIV:
492         handleDIV(insn);
493         break;
494      case OP_MOD:
495         handleMOD(insn);
496         break;
497      case OP_MAD:
498      case OP_MUL:
499         handleMUL(insn);
500         break;
501      default:
502         break;
503      }
504
505      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
506         handleAddrDef(insn);
507   }
508   return true;
509}
510
// Pass run by TargetNV50::runLegalizePass() for CG_STAGE_PRE_SSA.
class NV50LoweringPreSSA : public Pass
{
public:
   NV50LoweringPreSSA(Program *);

private:
   // Dispatches on the opcode to the handle*() methods below.
   virtual bool visit(Instruction *);
   // For compute programs, sets up the implicit thread id argument (tid).
   virtual bool visit(Function *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);

   bool handleEXPORT(Instruction *);

   bool handleDIV(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);

   bool handleSET(Instruction *);
   bool handleSLCT(CmpInstruction *);
   bool handleSELP(Instruction *);

   bool handleTEX(TexInstruction *);
   bool handleTXB(TexInstruction *); // I really
   bool handleTXL(TexInstruction *); // hate
   bool handleTXD(TexInstruction *); // these 3

   bool handleCALL(Instruction *);
   bool handlePRECONT(Instruction *);
   bool handleCONT(Instruction *);

   // Makes sure the predicate of an instruction is a FILE_FLAGS value.
   void checkPredicate(Instruction *);

private:
   // target of the program passed to the constructor
   const Target *const targ;

   BuildUtil bld;

   // Thread id value; NULL until visit(Function *) sets it for compute
   // programs. Read by handleCALL() and handleRDSV().
   Value *tid;
};
551
552NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
553   targ(prog->getTarget()), tid(NULL)
554{
555   bld.setProgram(prog);
556}
557
558bool
559NV50LoweringPreSSA::visit(Function *f)
560{
561   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
562
563   if (prog->getType() == Program::TYPE_COMPUTE) {
564      // Add implicit "thread id" argument in $r0 to the function
565      Value *arg = new_LValue(func, FILE_GPR);
566      arg->reg.data.id = 0;
567      f->ins.push_back(arg);
568
569      bld.setPosition(root, false);
570      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
571   }
572
573   return true;
574}
575
// Common source fix-ups for texture instructions:
//  - for shadow targets with TXB/TXL, swap the dref and bias/lod sources,
//  - for array targets, convert the layer from f32 to u32 and clamp it to 511,
//  - for cube arrays, retarget to a 2D array and rearrange the sources.
// Always returns true.
bool
NV50LoweringPreSSA::handleTEX(TexInstruction *i)
{
   const int arg = i->tex.target.getArgCount();
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

   // dref comes before bias/lod
   if (i->tex.target.isShadow())
      if (i->op == OP_TXB || i->op == OP_TXL)
         i->swapSources(dref, lod);

   // array index must be converted to u32
   if (i->tex.target.isArray()) {
      // the layer is the last of the target's arguments
      Value *layer = i->getSrc(arg - 1);
      LValue *src = new_LValue(func, FILE_GPR);
      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
      i->setSrc(arg - 1, src);

      if (i->tex.target.isCube()) {
         // Value *face = layer;
         Value *x, *y;
         x = new_LValue(func, FILE_GPR);
         y = new_LValue(func, FILE_GPR);
         layer = new_LValue(func, FILE_GPR);

         i->tex.target = TEX_TARGET_2D_ARRAY;

         // TODO: use TEXPREP to convert x,y,z,face -> x,y,layer
         // For now sources 0, 1 and 3 are simply copied and used as
         // x, y and layer; source 4 moves down to slot 3.
         bld.mkMov(x, i->getSrc(0));
         bld.mkMov(y, i->getSrc(1));
         bld.mkMov(layer, i->getSrc(3));

         i->setSrc(0, x);
         i->setSrc(1, y);
         i->setSrc(2, layer);
         i->setSrc(3, i->getSrc(4));
         i->setSrc(4, NULL);
      }
   }

   // texel offsets are 3 immediate fields in the instruction,
   // nv50 cannot do textureGatherOffsets
   assert(i->tex.useOffsets <= 1);

   return true;
}
625
626// Bias must be equal for all threads of a quad or lod calculation will fail.
627//
628// The lanes of a quad are grouped by the bit in the condition register they
629// have set, which is selected by differing bias values.
630// Move the input values for TEX into a new register set for each group and
631// execute TEX only for a specific group.
632// We always need to use 4 new registers for the inputs/outputs because the
633// implicitly calculated derivatives must be correct.
634//
635// TODO: move to SSA phase so we can easily determine whether bias is constant
// Lower TXB (see the comment above for the reasoning).
//
// After the common handleTEX() fix-ups nothing else is done if the bias is
// uniform. Otherwise the instruction is replaced by four predicated clones
// whose results are merged with UNIONs, and @i is deleted.
// Always returns true.
bool
NV50LoweringPreSSA::handleTXB(TexInstruction *i)
{
   // condition code selecting the clone for each of the four groups
   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
   int l, d;

   handleTEX(i);
   Value *bias = i->getSrc(i->tex.target.getArgCount());
   if (bias->isUniform())
      return true;

   // cond = UNION(1, bit1, bit2, bit3); sources 1..3 are added in the loop
   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
                                 bld.loadImm(NULL, 1));
   bld.setPosition(cond, false);

   for (l = 1; l < 4; ++l) {
      // quadop SUBR of the bias against the bias as seen from lane l, writing
      // flags only; bit = (1 << l) is moved under CC_EQ
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *bit = bld.getSSA();
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      Value *imm = bld.loadImm(NULL, (1 << l));
      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
      cond->setSrc(l, bit);
   }
   // turn the combined bits into a flags value
   Value *flags = bld.getScratch(1, FILE_FLAGS);
   bld.setPosition(cond, true);
   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));

   // one predicated clone of the texture instruction per group
   Instruction *tex[4];
   for (l = 0; l < 4; ++l) {
      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
      bld.insert(tex[l]);
   }

   // res[group][def]: group 0 uses the defs of its clone directly, the other
   // groups get shallow clones filled by predicated MOVs
   Value *res[4][4];
   for (d = 0; i->defExists(d); ++d)
      res[0][d] = tex[0]->getDef(d);
   for (l = 1; l < 4; ++l) {
      for (d = 0; tex[l]->defExists(d); ++d) {
         res[l][d] = cloneShallow(func, res[0][d]);
         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
      }
   }

   // merge the four per-group results into the original defs
   for (d = 0; i->defExists(d); ++d) {
      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
      for (l = 0; l < 4; ++l)
         dst->setSrc(l, res[l][d]);
   }
   delete_Instruction(prog, i);
   return true;
}
688
689// LOD must be equal for all threads of a quad.
690// Unlike with TXB, here we can just diverge since there's no LOD calculation
691// that would require all 4 threads' sources to be set up properly.
// Lower TXL (see the comment above for the reasoning).
//
// After the common handleTEX() fix-ups nothing else is done if the LOD is
// uniform. Otherwise the block is split around @i and a chain of four
// quadop-compare + branch pairs (one per lane) is built in front of the
// texture block, with a JOINAT before and a JOIN after.
// Always returns true.
bool
NV50LoweringPreSSA::handleTXL(TexInstruction *i)
{
   handleTEX(i);
   Value *lod = i->getSrc(i->tex.target.getArgCount());
   if (lod->isUniform())
      return true;

   // NOTE(review): presumably texiBB ends up holding just @i and joinBB
   // everything after it -- confirm against splitBefore()/splitAfter().
   BasicBlock *currBB = i->bb;
   BasicBlock *texiBB = i->bb->splitBefore(i, false);
   BasicBlock *joinBB = i->bb->splitAfter(i);

   bld.setPosition(currBB, true);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   for (int l = 0; l <= 3; ++l) {
      // quadop SUBR of the LOD against lane l's LOD, flags only; branch to
      // the texture block under CC_EQ
      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
      Value *pred = bld.getScratch(1, FILE_FLAGS);
      bld.setPosition(currBB, true);
      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
      if (l <= 2) {
         // the next lane's test goes into a fresh block chained after this one
         BasicBlock *laneBB = new BasicBlock(func);
         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
         currBB = laneBB;
      }
   }
   bld.setPosition(joinBB, false);
   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
   return true;
}
724
// Lower TXD to four plain TEX operations, one per lane.
//
// For each lane l, the coordinates of lane l are distributed with quadops,
// dPdx/dPdy of lane l are applied according to the qOps table, a clone of
// the (now TEX) instruction is executed on those coordinates and its results
// are saved by MOVs restricted to lane l. The per-lane results are merged
// with UNIONs and @i is removed from its block.
// Always returns true.
bool
NV50LoweringPreSSA::handleTXD(TexInstruction *i)
{
   // qOps[lane][0] is used with dPdx, qOps[lane][1] with dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // indexed [result component][lane]
   Value *crd[3];    // scratch coordinates, dim entries used
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   // note: read before handleTEX(), which may change the target
   const int dim = i->tex.target.getDim();

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // only lane l keeps this clone's result
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the four per-lane values into each original def
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
783
784bool
785NV50LoweringPreSSA::handleSET(Instruction *i)
786{
787   if (i->dType == TYPE_F32) {
788      bld.setPosition(i, true);
789      i->dType = TYPE_U32;
790      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
791      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
792   }
793   return true;
794}
795
// Lower SLCT: the instruction itself becomes a SET that compares its former
// source 2 against 0 and writes flags; two MOVs predicated on those flags
// (CC_NE picks source 0, CC_EQ picks source 1) feed a UNION that writes the
// original result. Always returns true.
bool
NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();
   Value *pred = bld.getScratch(1, FILE_FLAGS);

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // XXX: these probably shouldn't be immediates in the first place ...
   // (emitted at the builder's current position, which the caller has set)
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   // the predicated MOVs and the UNION are emitted with setPosition(i, true)
   bld.setPosition(i, true);
   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);

   // turn @i into the flag-producing comparison; the zero immediate is
   // loaded with setPosition(i, false)
   bld.setPosition(i, false);
   i->op = OP_SET;
   i->setFlagsDef(0, pred);
   i->dType = TYPE_U8;
   i->setSrc(0, i->getSrc(2));
   i->setSrc(2, NULL);
   i->setSrc(1, bld.loadImm(NULL, 0));

   return true;
}
826
// Lower SELP: two MOVs predicated on source 2 (CC_NE picks source 0, CC_EQ
// picks source 1) feed a UNION writing the original result; @i is deleted.
// Always returns true.
bool
NV50LoweringPreSSA::handleSELP(Instruction *i)
{
   Value *src0 = bld.getSSA();
   Value *src1 = bld.getSSA();

   Value *v0 = i->getSrc(0);
   Value *v1 = i->getSrc(1);
   // immediates are copied into SSA values first
   if (v0->asImm())
      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
   if (v1->asImm())
      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);

   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
   delete_Instruction(prog, i);
   return true;
}
846
847bool
848NV50LoweringPreSSA::handleWRSV(Instruction *i)
849{
850   Symbol *sym = i->getSrc(0)->asSym();
851
852   // these are all shader outputs, $sreg are not writeable
853   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
854   if (addr >= 0x400)
855      return false;
856   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
857
858   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
859
860   bld.getBB()->remove(i);
861   return true;
862}
863
864bool
865NV50LoweringPreSSA::handleCALL(Instruction *i)
866{
867   if (prog->getType() == Program::TYPE_COMPUTE) {
868      // Add implicit "thread id" argument in $r0 to the function
869      i->setSrc(i->srcCount(), tid);
870   }
871   return true;
872}
873
874bool
875NV50LoweringPreSSA::handlePRECONT(Instruction *i)
876{
877   delete_Instruction(prog, i);
878   return true;
879}
880
881bool
882NV50LoweringPreSSA::handleCONT(Instruction *i)
883{
884   i->op = OP_BRA;
885   return true;
886}
887
// Lower a system value read.
//
// If the target reports an address of 0x400 or above the instruction is kept
// unchanged. Otherwise a replacement that depends on the system value is
// emitted at the builder's current position and @i is removed from the
// builder's current block. Always returns true.
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index; // component of the system value

   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         // keep only bit 31, then XOR with 0xbf800000 (the bit pattern of
         // -1.0f): the result is +1.0f if the bit was set, -1.0f if not
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID:
      // components beyond those handled below read as constants:
      // 1 for the sizes (NCTAID, NTID), 0 for the index (CTAID)
      if ((sv == SV_NCTAID && idx >= 2) ||
          (sv == SV_NTID && idx >= 3)) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (sv == SV_CTAID && idx >= 2) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         // 16 bit load from FILE_MEMORY_SHARED at addr, widened to u32
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   case SV_TID:
      // components are bit fields of tid: x = bits 0..15, y = bits 16..25,
      // z = bits 26 and up; any further component reads as 0
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   default:
      // everything else is fetched from the shader inputs
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}
947
948bool
949NV50LoweringPreSSA::handleDIV(Instruction *i)
950{
951   if (!isFloatType(i->dType))
952      return true;
953   bld.setPosition(i, false);
954   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
955   i->op = OP_MUL;
956   i->setSrc(1, rcp->getDef(0));
957   return true;
958}
959
960bool
961NV50LoweringPreSSA::handleSQRT(Instruction *i)
962{
963   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
964                                bld.getSSA(), i->getSrc(0));
965   i->op = OP_MUL;
966   i->setSrc(1, rsq->getDef(0));
967
968   return true;
969}
970
971bool
972NV50LoweringPreSSA::handlePOW(Instruction *i)
973{
974   LValue *val = bld.getScratch();
975
976   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
977   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
978   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
979
980   i->op = OP_EX2;
981   i->setSrc(0, val);
982   i->setSrc(1, NULL);
983
984   return true;
985}
986
987bool
988NV50LoweringPreSSA::handleEXPORT(Instruction *i)
989{
990   if (prog->getType() == Program::TYPE_FRAGMENT) {
991      if (i->getIndirect(0, 0)) {
992         // TODO: redirect to l[] here, load to GPRs at exit
993         return false;
994      } else {
995         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
996
997         i->op = OP_MOV;
998         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
999         i->src(0).set(i->src(1));
1000         i->setSrc(1, NULL);
1001         i->setDef(0, new_LValue(func, FILE_GPR));
1002         i->getDef(0)->reg.data.id = id;
1003
1004         prog->maxGPR = MAX2(prog->maxGPR, id);
1005      }
1006   }
1007   return true;
1008}
1009
1010// Set flags according to predicate and make the instruction read $cX.
1011void
1012NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1013{
1014   Value *pred = insn->getPredicate();
1015   Value *cdst;
1016
1017   if (!pred || pred->reg.file == FILE_FLAGS)
1018      return;
1019   cdst = bld.getSSA(1, FILE_FLAGS);
1020
1021   bld.mkCmp(OP_SET, CC_NEU, TYPE_U32, cdst, bld.loadImm(NULL, 0), pred);
1022
1023   insn->setPredicate(insn->cc, cdst);
1024}
1025
1026//
1027// - add quadop dance for texturing
1028// - put FP outputs in GPRs
1029// - convert instruction sequences
1030//
1031bool
1032NV50LoweringPreSSA::visit(Instruction *i)
1033{
1034   bld.setPosition(i, false);
1035
1036   if (i->cc != CC_ALWAYS)
1037      checkPredicate(i);
1038
1039   switch (i->op) {
1040   case OP_TEX:
1041   case OP_TXF:
1042   case OP_TXG:
1043      return handleTEX(i->asTex());
1044   case OP_TXB:
1045      return handleTXB(i->asTex());
1046   case OP_TXL:
1047      return handleTXL(i->asTex());
1048   case OP_TXD:
1049      return handleTXD(i->asTex());
1050   case OP_EX2:
1051      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1052      i->setSrc(0, i->getDef(0));
1053      break;
1054   case OP_SET:
1055      return handleSET(i);
1056   case OP_SLCT:
1057      return handleSLCT(i->asCmp());
1058   case OP_SELP:
1059      return handleSELP(i);
1060   case OP_POW:
1061      return handlePOW(i);
1062   case OP_DIV:
1063      return handleDIV(i);
1064   case OP_SQRT:
1065      return handleSQRT(i);
1066   case OP_EXPORT:
1067      return handleEXPORT(i);
1068   case OP_RDSV:
1069      return handleRDSV(i);
1070   case OP_WRSV:
1071      return handleWRSV(i);
1072   case OP_CALL:
1073      return handleCALL(i);
1074   case OP_PRECONT:
1075      return handlePRECONT(i);
1076   case OP_CONT:
1077      return handleCONT(i);
1078   default:
1079      break;
1080   }
1081   return true;
1082}
1083
1084bool
1085TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1086{
1087   bool ret = false;
1088
1089   if (stage == CG_STAGE_PRE_SSA) {
1090      NV50LoweringPreSSA pass(prog);
1091      ret = pass.run(prog, false, true);
1092   } else
1093   if (stage == CG_STAGE_SSA) {
1094      if (!prog->targetPriv)
1095         prog->targetPriv = new std::list<Instruction *>();
1096      NV50LegalizeSSA pass(prog);
1097      ret = pass.run(prog, false, true);
1098   } else
1099   if (stage == CG_STAGE_POST_RA) {
1100      NV50LegalizePostRA pass;
1101      ret = pass.run(prog, false, true);
1102      if (prog->targetPriv)
1103         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1104   }
1105   return ret;
1106}
1107
1108} // namespace nv50_ir
1109