1/*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "codegen/nv50_ir.h"
24#include "codegen/nv50_ir_build_util.h"
25
26#include "codegen/nv50_ir_target_nv50.h"
27
28namespace nv50_ir {
29
30// nv50 doesn't support 32 bit integer multiplication
31//
32//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
33// -------------------
34//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
35// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
36//       al*bl
37//    ah*bl 00
38//
39// fffe0001 + fffe0001
40//
41// Note that this sort of splitting doesn't work for signed values, so we
42// compute the sign on those manually and then perform an unsigned multiply.
43static bool
44expandIntegerMUL(BuildUtil *bld, Instruction *mul)
45{
46   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
47   ImmediateValue src1;
48   bool src1imm = mul->src(1).getImmediate(src1);
49
50   DataType fTy; // full type
51   switch (mul->sType) {
52   case TYPE_S32: fTy = TYPE_U32; break;
53   case TYPE_S64: fTy = TYPE_U64; break;
54   default: fTy = mul->sType; break;
55   }
56
57   DataType hTy; // half type
58   switch (fTy) {
59   case TYPE_U32: hTy = TYPE_U16; break;
60   case TYPE_U64: hTy = TYPE_U32; break;
61   default:
62      return false;
63   }
64   unsigned int fullSize = typeSizeof(fTy);
65   unsigned int halfSize = typeSizeof(hTy);
66
67   Instruction *i[9];
68
69   bld->setPosition(mul, true);
70
71   Value *s[2];
72   Value *a[2], *b[2];
73   Value *t[4];
74   for (int j = 0; j < 4; ++j)
75      t[j] = bld->getSSA(fullSize);
76
77   if (isSignedType(mul->sType) && highResult) {
78      s[0] = bld->getSSA(fullSize);
79      s[1] = bld->getSSA(fullSize);
80      bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
81      bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
82      src1.reg.data.s32 = abs(src1.reg.data.s32);
83   } else {
84      s[0] = mul->getSrc(0);
85      s[1] = mul->getSrc(1);
86   }
87
88   // split sources into halves
89   i[0] = bld->mkSplit(a, halfSize, s[0]);
90   i[1] = bld->mkSplit(b, halfSize, s[1]);
91
92   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
93      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
94                               bld->mkImm(src1.reg.data.u32 & 0xffff));
95   } else {
96      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
97                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
98      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
99         i[3] = i[2];
100         t[1] = t[0];
101      } else {
102         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
103      }
104   }
105   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
106   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
107      i[4] = i[3];
108      t[3] = t[2];
109   } else {
110      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
111   }
112
113   if (highResult) {
114      Value *c[2];
115      Value *r[5];
116      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
117      c[0] = bld->getSSA(1, FILE_FLAGS);
118      c[1] = bld->getSSA(1, FILE_FLAGS);
119      for (int j = 0; j < 5; ++j)
120         r[j] = bld->getSSA(fullSize);
121
122      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
123      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
124      bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
125      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
126      i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
127
128      // set carry defs / sources
129      i[3]->setFlagsDef(1, c[0]);
130      // actual result required in negative case, but ignored for
131      // unsigned. for some reason the compiler ends up dropping the whole
132      // instruction if the destination is unused but the flags are.
133      if (isSignedType(mul->sType))
134         i[4]->setFlagsDef(1, c[1]);
135      else
136         i[4]->setFlagsDef(0, c[1]);
137      i[6]->setPredicate(CC_C, c[0]);
138      i[5]->setFlagsSrc(3, c[1]);
139
140      if (isSignedType(mul->sType)) {
141         Value *cc[2];
142         Value *rr[7];
143         Value *one = bld->getSSA(fullSize);
144         bld->loadImm(one, 1);
145         for (int j = 0; j < 7; j++)
146            rr[j] = bld->getSSA(fullSize);
147
148         // NOTE: this logic uses predicates because splitting basic blocks is
149         // ~impossible during the SSA phase. The RA relies on a correlation
150         // between edge order and phi node sources.
151
152         // Set the sign of the result based on the inputs
153         bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
154            ->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
155
156         // 1s complement of 64-bit value
157         bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
158            ->setPredicate(CC_S, cc[0]);
159         bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
160            ->setPredicate(CC_S, cc[0]);
161
162         // add to low 32-bits, keep track of the carry
163         Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
164         n->setPredicate(CC_S, cc[0]);
165         n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
166
167         // If there was a carry, add 1 to the upper 32 bits
168         // XXX: These get executed even if they shouldn't be
169         bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
170            ->setPredicate(CC_C, cc[1]);
171         bld->mkMov(rr[3], rr[0])
172            ->setPredicate(CC_NC, cc[1]);
173         bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
174
175         // Merge the results from the negative and non-negative paths
176         bld->mkMov(rr[5], rr[4])
177            ->setPredicate(CC_S, cc[0]);
178         bld->mkMov(rr[6], r[4])
179            ->setPredicate(CC_NS, cc[0]);
180         bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
181      } else {
182         bld->mkMov(mul->getDef(0), r[4]);
183      }
184   } else {
185      bld->mkMov(mul->getDef(0), t[3]);
186   }
187   delete_Instruction(bld->getProgram(), mul);
188
189   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
190      if (i[j])
191         i[j]->sType = hTy;
192
193   return true;
194}
195
// 2-bit per-lane operation selectors used to assemble a quadop byte.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four lane selectors into one byte. Arguments are given by their
// QOP_ suffix, in the lane order UL UR LL LR (UL ends up in bits 7:6, LR in
// bits 1:0).
#define QUADOP(ul, ur, ll, lr)                                          \
   ((QOP_##ul << 6) | (QOP_##ur << 4) | (QOP_##ll << 2) | (QOP_##lr << 0))
205
206class NV50LegalizePostRA : public Pass
207{
208private:
209   virtual bool visit(Function *);
210   virtual bool visit(BasicBlock *);
211
212   void handlePRERET(FlowInstruction *);
213   void replaceZero(Instruction *);
214
215   LValue *r63;
216};
217
218bool
219NV50LegalizePostRA::visit(Function *fn)
220{
221   Program *prog = fn->getProgram();
222
223   r63 = new_LValue(fn, FILE_GPR);
224   // GPR units on nv50 are in half-regs
225   if (prog->maxGPR < 126)
226      r63->reg.data.id = 63;
227   else
228      r63->reg.data.id = 127;
229
230   // this is actually per-program, but we can do it all on visiting main()
231   std::list<Instruction *> *outWrites =
232      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
233
234   if (outWrites) {
235      for (std::list<Instruction *>::iterator it = outWrites->begin();
236           it != outWrites->end(); ++it)
237         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
238      // instructions will be deleted on exit
239      outWrites->clear();
240   }
241
242   return true;
243}
244
245void
246NV50LegalizePostRA::replaceZero(Instruction *i)
247{
248   for (int s = 0; i->srcExists(s); ++s) {
249      ImmediateValue *imm = i->getSrc(s)->asImm();
250      if (imm && imm->reg.data.u64 == 0)
251         i->setSrc(s, r63);
252   }
253}
254
255// Emulate PRERET: jump to the target and call to the origin from there
256//
257// WARNING: atm only works if BBs are affected by at most a single PRERET
258//
259// BB:0
260// preret BB:3
261// (...)
262// BB:3
263// (...)
264//             --->
265// BB:0
266// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
267// (...)
268// BB:3
269// bra BB:3 + n1 (skip the call)
270// call BB:0 + n2 (skip bra at beginning of BB:0)
271// (...)
272void
273NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
274{
275   BasicBlock *bbE = pre->bb;
276   BasicBlock *bbT = pre->target.bb;
277
278   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
279   bbE->remove(pre);
280   bbE->insertHead(pre);
281
282   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
283   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
284
285   bbT->insertHead(call);
286   bbT->insertHead(skip);
287
288   // NOTE: maybe split blocks to prevent the instructions from moving ?
289
290   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
291   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
292}
293
294bool
295NV50LegalizePostRA::visit(BasicBlock *bb)
296{
297   Instruction *i, *next;
298
299   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
300   for (i = bb->getFirst(); i; i = next) {
301      next = i->next;
302      if (i->isNop()) {
303         bb->remove(i);
304      } else
305      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
306         handlePRERET(i->asFlow());
307      } else {
308         // TODO: We will want to do this before register allocation,
309         // since have to use a $c register for the carry flag.
310         if (typeSizeof(i->dType) == 8) {
311            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
312            if (hi)
313               next = hi;
314         }
315
316         if (i->op != OP_PFETCH && i->op != OP_BAR &&
317             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
318            replaceZero(i);
319      }
320   }
321   if (!bb->getEntry())
322      return true;
323
324   return true;
325}
326
327class NV50LegalizeSSA : public Pass
328{
329public:
330   NV50LegalizeSSA(Program *);
331
332   virtual bool visit(BasicBlock *bb);
333
334private:
335   void propagateWriteToOutput(Instruction *);
336   void handleDIV(Instruction *);
337   void handleMOD(Instruction *);
338   void handleMUL(Instruction *);
339   void handleAddrDef(Instruction *);
340
341   inline bool isARL(const Instruction *) const;
342
343   BuildUtil bld;
344
345   std::list<Instruction *> *outWrites;
346};
347
348NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
349{
350   bld.setProgram(prog);
351
352   if (prog->optLevel >= 2 &&
353       (prog->getType() == Program::TYPE_GEOMETRY ||
354        prog->getType() == Program::TYPE_VERTEX))
355      outWrites =
356         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
357   else
358      outWrites = NULL;
359}
360
361void
362NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
363{
364   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
365      return;
366
367   // check def instruction can store
368   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
369
370   // TODO: move exports (if beneficial) in common opt pass
371   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
372      return;
373
374   for (int s = 0; di->srcExists(s); ++s)
375      if (di->src(s).getFile() == FILE_IMMEDIATE ||
376          di->src(s).getFile() == FILE_MEMORY_LOCAL)
377         return;
378
379   if (prog->getType() == Program::TYPE_GEOMETRY) {
380      // Only propagate output writes in geometry shaders when we can be sure
381      // that we are propagating to the same output vertex.
382      if (di->bb != st->bb)
383         return;
384      Instruction *i;
385      for (i = di; i != st; i = i->next) {
386         if (i->op == OP_EMIT || i->op == OP_RESTART)
387            return;
388      }
389      assert(i); // st after di
390   }
391
392   // We cannot set defs to non-lvalues before register allocation, so
393   // save & remove (to save registers) the exports and replace later.
394   outWrites->push_back(st);
395   st->bb->remove(st);
396}
397
398bool
399NV50LegalizeSSA::isARL(const Instruction *i) const
400{
401   ImmediateValue imm;
402
403   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
404      return false;
405   if (!i->src(1).getImmediate(imm))
406      return false;
407   return imm.isInteger(0);
408}
409
410void
411NV50LegalizeSSA::handleAddrDef(Instruction *i)
412{
413   Instruction *arl;
414
415   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
416
417   // PFETCH can always write to $a
418   if (i->op == OP_PFETCH)
419      return;
420   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
421   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
422      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
423         return;
424      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
425         return;
426   }
427
428   // turn $a sources into $r sources (can't operate on $a)
429   for (int s = 0; i->srcExists(s); ++s) {
430      Value *a = i->getSrc(s);
431      Value *r;
432      if (a->reg.file == FILE_ADDRESS) {
433         if (a->getInsn() && isARL(a->getInsn())) {
434            i->setSrc(s, a->getInsn()->getSrc(0));
435         } else {
436            bld.setPosition(i, false);
437            r = bld.getSSA();
438            bld.mkMov(r, a);
439            i->setSrc(s, r);
440         }
441      }
442   }
443   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
444      return;
445
446   // turn result back into $a
447   bld.setPosition(i, true);
448   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
449   i->setDef(0, arl->getSrc(0));
450}
451
452void
453NV50LegalizeSSA::handleMUL(Instruction *mul)
454{
455   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
456      return;
457   Value *def = mul->getDef(0);
458   Value *pred = mul->getPredicate();
459   CondCode cc = mul->cc;
460   if (pred)
461      mul->setPredicate(CC_ALWAYS, NULL);
462
463   if (mul->op == OP_MAD) {
464      Instruction *add = mul;
465      bld.setPosition(add, false);
466      Value *res = cloneShallow(func, mul->getDef(0));
467      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
468      add->op = OP_ADD;
469      add->setSrc(0, mul->getDef(0));
470      add->setSrc(1, add->getSrc(2));
471      for (int s = 2; add->srcExists(s); ++s)
472         add->setSrc(s, NULL);
473      mul->subOp = add->subOp;
474      add->subOp = 0;
475   }
476   expandIntegerMUL(&bld, mul);
477   if (pred)
478      def->getInsn()->setPredicate(cc, pred);
479}
480
481// Use f32 division: first compute an approximate result, use it to reduce
482// the dividend, which should then be representable as f32, divide the reduced
483// dividend, and add the quotients.
484void
485NV50LegalizeSSA::handleDIV(Instruction *div)
486{
487   const DataType ty = div->sType;
488
489   if (ty != TYPE_U32 && ty != TYPE_S32)
490      return;
491
492   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
493
494   bld.setPosition(div, false);
495
496   Value *a, *af = bld.getSSA();
497   Value *b, *bf = bld.getSSA();
498
499   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
500   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
501
502   if (isSignedType(ty)) {
503      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
504      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
505      a = bld.getSSA();
506      b = bld.getSSA();
507      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
508      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
509   } else {
510      a = div->getSrc(0);
511      b = div->getSrc(1);
512   }
513
514   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
515   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
516
517   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
518   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
519
520   // get error of 1st result
521   expandIntegerMUL(&bld,
522      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
523   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
524
525   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
526
527   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
528   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
529      ->rnd = ROUND_Z;
530   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
531
532   // correction: if modulus >= divisor, add 1
533   expandIntegerMUL(&bld,
534      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
535   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
536   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
537   if (!isSignedType(ty)) {
538      div->op = OP_SUB;
539      div->setSrc(0, q);
540      div->setSrc(1, s);
541   } else {
542      t = q;
543      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
544      s = bld.getSSA();
545      t = bld.getSSA();
546      // fix the sign
547      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
548         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
549      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
550      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
551
552      div->op = OP_UNION;
553      div->setSrc(0, s);
554      div->setSrc(1, t);
555   }
556}
557
558void
559NV50LegalizeSSA::handleMOD(Instruction *mod)
560{
561   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
562      return;
563   bld.setPosition(mod, false);
564
565   Value *q = bld.getSSA();
566   Value *m = bld.getSSA();
567
568   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
569   handleDIV(q->getInsn());
570
571   bld.setPosition(mod, false);
572   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
573
574   mod->op = OP_SUB;
575   mod->setSrc(1, m);
576}
577
578bool
579NV50LegalizeSSA::visit(BasicBlock *bb)
580{
581   Instruction *insn, *next;
582   // skipping PHIs (don't pass them to handleAddrDef) !
583   for (insn = bb->getEntry(); insn; insn = next) {
584      next = insn->next;
585
586      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
587         handleAddrDef(insn);
588
589      switch (insn->op) {
590      case OP_EXPORT:
591         if (outWrites)
592            propagateWriteToOutput(insn);
593         break;
594      case OP_DIV:
595         handleDIV(insn);
596         break;
597      case OP_MOD:
598         handleMOD(insn);
599         break;
600      case OP_MAD:
601      case OP_MUL:
602         handleMUL(insn);
603         break;
604      default:
605         break;
606      }
607   }
608   return true;
609}
610
611class NV50LoweringPreSSA : public Pass
612{
613public:
614   NV50LoweringPreSSA(Program *);
615
616private:
617   virtual bool visit(Instruction *);
618   virtual bool visit(Function *);
619
620   bool handleRDSV(Instruction *);
621   bool handleWRSV(Instruction *);
622
623   bool handlePFETCH(Instruction *);
624   bool handleEXPORT(Instruction *);
625   bool handleLOAD(Instruction *);
626
627   bool handleDIV(Instruction *);
628   bool handleSQRT(Instruction *);
629   bool handlePOW(Instruction *);
630
631   bool handleSET(Instruction *);
632   bool handleSLCT(CmpInstruction *);
633   bool handleSELP(Instruction *);
634
635   bool handleTEX(TexInstruction *);
636   bool handleTXB(TexInstruction *); // I really
637   bool handleTXL(TexInstruction *); // hate
638   bool handleTXD(TexInstruction *); // these 3
639   bool handleTXLQ(TexInstruction *);
640   bool handleTXQ(TexInstruction *);
641
642   bool handleCALL(Instruction *);
643   bool handlePRECONT(Instruction *);
644   bool handleCONT(Instruction *);
645
646   void checkPredicate(Instruction *);
647   void loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y);
648   void loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy);
649
650private:
651   const Target *const targ;
652
653   BuildUtil bld;
654
655   Value *tid;
656};
657
658NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
659   targ(prog->getTarget()), tid(NULL)
660{
661   bld.setProgram(prog);
662}
663
664bool
665NV50LoweringPreSSA::visit(Function *f)
666{
667   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
668
669   if (prog->getType() == Program::TYPE_COMPUTE) {
670      // Add implicit "thread id" argument in $r0 to the function
671      Value *arg = new_LValue(func, FILE_GPR);
672      arg->reg.data.id = 0;
673      f->ins.push_back(arg);
674
675      bld.setPosition(root, false);
676      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
677   }
678
679   return true;
680}
681
682void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
683                                       Value **ms_x, Value **ms_y) {
684   // This loads the texture-indexed ms setting from the constant buffer
685   Value *tmp = new_LValue(func, FILE_GPR);
686   uint8_t b = prog->driver->io.auxCBSlot;
687   off += prog->driver->io.suInfoBase;
688   if (prog->getType() > Program::TYPE_VERTEX)
689      off += 16 * 2 * 4;
690   if (prog->getType() > Program::TYPE_GEOMETRY)
691      off += 16 * 2 * 4;
692   *ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
693                             FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
694   *ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
695                             FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
696   *ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, *ms_x, *ms_y);
697}
698
699void NV50LoweringPreSSA::loadMsInfo(Value *ms, Value *s, Value **dx, Value **dy) {
700   // Given a MS level, and a sample id, compute the delta x/y
701   uint8_t b = prog->driver->io.msInfoCBSlot;
702   Value *off = new_LValue(func, FILE_ADDRESS), *t = new_LValue(func, FILE_GPR);
703
704   // The required information is at mslevel * 16 * 4 + sample * 8
705   // = (mslevel * 8 + sample) * 8
706   bld.mkOp2(OP_SHL,
707             TYPE_U32,
708             off,
709             bld.mkOp2v(OP_ADD, TYPE_U32, t,
710                        bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
711                        s),
712             bld.mkImm(3));
713   *dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
714                           FILE_MEMORY_CONST, b, TYPE_U32,
715                           prog->driver->io.msInfoBase), off);
716   *dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
717                           FILE_MEMORY_CONST, b, TYPE_U32,
718                           prog->driver->io.msInfoBase + 4), off);
719}
720
721bool
722NV50LoweringPreSSA::handleTEX(TexInstruction *i)
723{
724   const int arg = i->tex.target.getArgCount();
725   const int dref = arg;
726   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
727
728   /* Only normalize in the non-explicit derivatives case.
729    */
730   if (i->tex.target.isCube() && i->op != OP_TXD) {
731      Value *src[3], *val;
732      int c;
733      for (c = 0; c < 3; ++c)
734         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
735      val = bld.getScratch();
736      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
737      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
738      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
739      for (c = 0; c < 3; ++c) {
740         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
741                                 i->getSrc(c), val));
742      }
743   }
744
745   // handle MS, which means looking up the MS params for this texture, and
746   // adjusting the input coordinates to point at the right sample.
747   if (i->tex.target.isMS()) {
748      Value *x = i->getSrc(0);
749      Value *y = i->getSrc(1);
750      Value *s = i->getSrc(arg - 1);
751      Value *tx = new_LValue(func, FILE_GPR), *ty = new_LValue(func, FILE_GPR),
752         *ms, *ms_x, *ms_y, *dx, *dy;
753
754      i->tex.target.clearMS();
755
756      loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
757      loadMsInfo(ms, s, &dx, &dy);
758
759      bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
760      bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
761      bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
762      bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
763      i->setSrc(0, tx);
764      i->setSrc(1, ty);
765      i->setSrc(arg - 1, bld.loadImm(NULL, 0));
766   }
767
768   // dref comes before bias/lod
769   if (i->tex.target.isShadow())
770      if (i->op == OP_TXB || i->op == OP_TXL)
771         i->swapSources(dref, lod);
772
773   if (i->tex.target.isArray()) {
774      if (i->op != OP_TXF) {
775         // array index must be converted to u32, but it's already an integer
776         // for TXF
777         Value *layer = i->getSrc(arg - 1);
778         LValue *src = new_LValue(func, FILE_GPR);
779         bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
780         bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
781         i->setSrc(arg - 1, src);
782      }
783      if (i->tex.target.isCube() && i->srcCount() > 4) {
784         std::vector<Value *> acube, a2d;
785         int c;
786
787         acube.resize(4);
788         for (c = 0; c < 4; ++c)
789            acube[c] = i->getSrc(c);
790         a2d.resize(4);
791         for (c = 0; c < 3; ++c)
792            a2d[c] = new_LValue(func, FILE_GPR);
793         a2d[3] = NULL;
794
795         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
796                   a2d, acube)->asTex()->tex.mask = 0x7;
797
798         for (c = 0; c < 3; ++c)
799            i->setSrc(c, a2d[c]);
800         for (; i->srcExists(c + 1); ++c)
801            i->setSrc(c, i->getSrc(c + 1));
802         i->setSrc(c, NULL);
803         assert(c <= 4);
804
805         i->tex.target = i->tex.target.isShadow() ?
806            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
807      }
808   }
809
810   // texel offsets are 3 immediate fields in the instruction,
811   // nv50 cannot do textureGatherOffsets
812   assert(i->tex.useOffsets <= 1);
813   if (i->tex.useOffsets) {
814      for (int c = 0; c < 3; ++c) {
815         ImmediateValue val;
816         if (!i->offset[0][c].getImmediate(val))
817            assert(!"non-immediate offset");
818         i->tex.offset[c] = val.reg.data.u32;
819         i->offset[0][c].set(NULL);
820      }
821   }
822
823   return true;
824}
825
826// Bias must be equal for all threads of a quad or lod calculation will fail.
827//
828// The lanes of a quad are grouped by the bit in the condition register they
829// have set, which is selected by differing bias values.
830// Move the input values for TEX into a new register set for each group and
831// execute TEX only for a specific group.
832// We always need to use 4 new registers for the inputs/outputs because the
833// implicitly calculated derivatives must be correct.
834//
835// TODO: move to SSA phase so we can easily determine whether bias is constant
836bool
837NV50LoweringPreSSA::handleTXB(TexInstruction *i)
838{
839   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
840   int l, d;
841
842   // We can't actually apply bias *and* do a compare for a cube
843   // texture. Since the compare has to be done before the filtering, just
844   // drop the bias on the floor.
845   if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
846      i->op = OP_TEX;
847      i->setSrc(3, i->getSrc(4));
848      i->setSrc(4, NULL);
849      return handleTEX(i);
850   }
851
852   handleTEX(i);
853   Value *bias = i->getSrc(i->tex.target.getArgCount());
854   if (bias->isUniform())
855      return true;
856
857   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
858                                 bld.loadImm(NULL, 1));
859   bld.setPosition(cond, false);
860
861   for (l = 1; l < 4; ++l) {
862      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
863      Value *bit = bld.getSSA();
864      Value *pred = bld.getScratch(1, FILE_FLAGS);
865      Value *imm = bld.loadImm(NULL, (1 << l));
866      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
867      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
868      cond->setSrc(l, bit);
869   }
870   Value *flags = bld.getScratch(1, FILE_FLAGS);
871   bld.setPosition(cond, true);
872   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0))->flagsDef = 0;
873
874   Instruction *tex[4];
875   for (l = 0; l < 4; ++l) {
876      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
877      bld.insert(tex[l]);
878   }
879
880   Value *res[4][4];
881   for (d = 0; i->defExists(d); ++d)
882      res[0][d] = tex[0]->getDef(d);
883   for (l = 1; l < 4; ++l) {
884      for (d = 0; tex[l]->defExists(d); ++d) {
885         res[l][d] = cloneShallow(func, res[0][d]);
886         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
887      }
888   }
889
890   for (d = 0; i->defExists(d); ++d) {
891      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
892      for (l = 0; l < 4; ++l)
893         dst->setSrc(l, res[l][d]);
894   }
895   delete_Instruction(prog, i);
896   return true;
897}
898
899// LOD must be equal for all threads of a quad.
900// Unlike with TXB, here we can just diverge since there's no LOD calculation
901// that would require all 4 threads' sources to be set up properly.
902bool
903NV50LoweringPreSSA::handleTXL(TexInstruction *i)
904{
905   handleTEX(i);
906   Value *lod = i->getSrc(i->tex.target.getArgCount());
907   if (lod->isUniform())
908      return true;
909
910   BasicBlock *currBB = i->bb;
911   BasicBlock *texiBB = i->bb->splitBefore(i, false);
912   BasicBlock *joinBB = i->bb->splitAfter(i);
913
914   bld.setPosition(currBB, true);
915   assert(!currBB->joinAt);
916   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
917
918   for (int l = 0; l <= 3; ++l) {
919      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
920      Value *pred = bld.getScratch(1, FILE_FLAGS);
921      bld.setPosition(currBB, true);
922      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
923      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
924      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
925      if (l <= 2) {
926         BasicBlock *laneBB = new BasicBlock(func);
927         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
928         currBB = laneBB;
929      }
930   }
931   bld.setPosition(joinBB, false);
932   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
933   return true;
934}
935
936bool
937NV50LoweringPreSSA::handleTXD(TexInstruction *i)
938{
939   static const uint8_t qOps[4][2] =
940   {
941      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
942      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
943      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
944      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
945   };
946   Value *def[4][4];
947   Value *crd[3];
948   Instruction *tex;
949   Value *zero = bld.loadImm(bld.getSSA(), 0);
950   int l, c;
951   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
952
953   handleTEX(i);
954   i->op = OP_TEX; // no need to clone dPdx/dPdy later
955   i->tex.derivAll = true;
956
957   for (c = 0; c < dim; ++c)
958      crd[c] = bld.getScratch();
959
960   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
961   for (l = 0; l < 4; ++l) {
962      Value *src[3], *val;
963      // mov coordinates from lane l to all lanes
964      for (c = 0; c < dim; ++c)
965         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
966      // add dPdx from lane l to lanes dx
967      for (c = 0; c < dim; ++c)
968         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
969      // add dPdy from lane l to lanes dy
970      for (c = 0; c < dim; ++c)
971         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
972      // normalize cube coordinates if necessary
973      if (i->tex.target.isCube()) {
974         for (c = 0; c < 3; ++c)
975            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
976         val = bld.getScratch();
977         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
978         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
979         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
980         for (c = 0; c < 3; ++c)
981            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
982      } else {
983         for (c = 0; c < dim; ++c)
984            src[c] = crd[c];
985      }
986      // texture
987      bld.insert(tex = cloneForward(func, i));
988      for (c = 0; c < dim; ++c)
989         tex->setSrc(c, src[c]);
990      // save results
991      for (c = 0; i->defExists(c); ++c) {
992         Instruction *mov;
993         def[c][l] = bld.getSSA();
994         mov = bld.mkMov(def[c][l], tex->getDef(c));
995         mov->fixed = 1;
996         mov->lanes = 1 << l;
997      }
998   }
999   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
1000
1001   for (c = 0; i->defExists(c); ++c) {
1002      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
1003      for (l = 0; l < 4; ++l)
1004         u->setSrc(l, def[c][l]);
1005   }
1006
1007   i->bb->remove(i);
1008   return true;
1009}
1010
1011bool
1012NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
1013{
1014   handleTEX(i);
1015   bld.setPosition(i, true);
1016
1017   /* The returned values are not quite what we want:
1018    * (a) convert from s32 to f32
1019    * (b) multiply by 1/256
1020    */
1021   for (int def = 0; def < 2; ++def) {
1022      if (!i->defExists(def))
1023         continue;
1024      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
1025      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
1026                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
1027   }
1028   return true;
1029}
1030
1031bool
1032NV50LoweringPreSSA::handleTXQ(TexInstruction *i)
1033{
1034   Value *ms, *ms_x, *ms_y;
1035   if (i->tex.query == TXQ_DIMS)
1036      return true;
1037   assert(i->tex.query == TXQ_TYPE);
1038   assert(i->tex.mask == 4);
1039
1040   loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
1041   bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.loadImm(NULL, 1), ms);
1042   i->bb->remove(i);
1043
1044   return true;
1045}
1046
1047
1048bool
1049NV50LoweringPreSSA::handleSET(Instruction *i)
1050{
1051   if (i->dType == TYPE_F32) {
1052      bld.setPosition(i, true);
1053      i->dType = TYPE_U32;
1054      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
1055      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
1056   }
1057   return true;
1058}
1059
1060bool
1061NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
1062{
1063   Value *src0 = bld.getSSA();
1064   Value *src1 = bld.getSSA();
1065   Value *pred = bld.getScratch(1, FILE_FLAGS);
1066
1067   Value *v0 = i->getSrc(0);
1068   Value *v1 = i->getSrc(1);
1069   // XXX: these probably shouldn't be immediates in the first place ...
1070   if (v0->asImm())
1071      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1072   if (v1->asImm())
1073      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1074
1075   bld.setPosition(i, true);
1076   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
1077   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
1078   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1079
1080   bld.setPosition(i, false);
1081   i->op = OP_SET;
1082   i->setFlagsDef(0, pred);
1083   i->dType = TYPE_U8;
1084   i->setSrc(0, i->getSrc(2));
1085   i->setSrc(2, NULL);
1086   i->setSrc(1, bld.loadImm(NULL, 0));
1087
1088   return true;
1089}
1090
1091bool
1092NV50LoweringPreSSA::handleSELP(Instruction *i)
1093{
1094   Value *src0 = bld.getSSA();
1095   Value *src1 = bld.getSSA();
1096
1097   Value *v0 = i->getSrc(0);
1098   Value *v1 = i->getSrc(1);
1099   if (v0->asImm())
1100      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
1101   if (v1->asImm())
1102      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
1103
1104   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
1105   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
1106   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
1107   delete_Instruction(prog, i);
1108   return true;
1109}
1110
1111bool
1112NV50LoweringPreSSA::handleWRSV(Instruction *i)
1113{
1114   Symbol *sym = i->getSrc(0)->asSym();
1115
1116   // these are all shader outputs, $sreg are not writeable
1117   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
1118   if (addr >= 0x400)
1119      return false;
1120   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1121
1122   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
1123
1124   bld.getBB()->remove(i);
1125   return true;
1126}
1127
1128bool
1129NV50LoweringPreSSA::handleCALL(Instruction *i)
1130{
1131   if (prog->getType() == Program::TYPE_COMPUTE) {
1132      // Add implicit "thread id" argument in $r0 to the function
1133      i->setSrc(i->srcCount(), tid);
1134   }
1135   return true;
1136}
1137
1138bool
1139NV50LoweringPreSSA::handlePRECONT(Instruction *i)
1140{
1141   delete_Instruction(prog, i);
1142   return true;
1143}
1144
1145bool
1146NV50LoweringPreSSA::handleCONT(Instruction *i)
1147{
1148   i->op = OP_BRA;
1149   return true;
1150}
1151
1152bool
1153NV50LoweringPreSSA::handleRDSV(Instruction *i)
1154{
1155   Symbol *sym = i->getSrc(0)->asSym();
1156   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1157   Value *def = i->getDef(0);
1158   SVSemantic sv = sym->reg.data.sv.sv;
1159   int idx = sym->reg.data.sv.index;
1160
1161   if (addr >= 0x400) // mov $sreg
1162      return true;
1163
1164   switch (sv) {
1165   case SV_POSITION:
1166      assert(prog->getType() == Program::TYPE_FRAGMENT);
1167      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1168      break;
1169   case SV_FACE:
1170      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
1171      if (i->dType == TYPE_F32) {
1172         bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
1173         bld.mkOp1(OP_NEG, TYPE_S32, def, def);
1174         bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
1175      }
1176      break;
1177   case SV_NCTAID:
1178   case SV_CTAID:
1179   case SV_NTID:
1180      if ((sv == SV_NCTAID && idx >= 2) ||
1181          (sv == SV_NTID && idx >= 3)) {
1182         bld.mkMov(def, bld.mkImm(1));
1183      } else if (sv == SV_CTAID && idx >= 2) {
1184         bld.mkMov(def, bld.mkImm(0));
1185      } else {
1186         Value *x = bld.getSSA(2);
1187         bld.mkOp1(OP_LOAD, TYPE_U16, x,
1188                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
1189         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
1190      }
1191      break;
1192   case SV_TID:
1193      if (idx == 0) {
1194         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
1195      } else if (idx == 1) {
1196         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
1197         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
1198      } else if (idx == 2) {
1199         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
1200      } else {
1201         bld.mkMov(def, bld.mkImm(0));
1202      }
1203      break;
1204   case SV_SAMPLE_POS: {
1205      Value *off = new_LValue(func, FILE_ADDRESS);
1206      bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
1207      bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
1208      bld.mkLoad(TYPE_F32,
1209                 def,
1210                 bld.mkSymbol(
1211                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
1212                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
1213                 off);
1214      break;
1215   }
1216   default:
1217      bld.mkFetch(i->getDef(0), i->dType,
1218                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
1219      break;
1220   }
1221   bld.getBB()->remove(i);
1222   return true;
1223}
1224
1225bool
1226NV50LoweringPreSSA::handleDIV(Instruction *i)
1227{
1228   if (!isFloatType(i->dType))
1229      return true;
1230   bld.setPosition(i, false);
1231   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1232   i->op = OP_MUL;
1233   i->setSrc(1, rcp->getDef(0));
1234   return true;
1235}
1236
1237bool
1238NV50LoweringPreSSA::handleSQRT(Instruction *i)
1239{
1240   bld.setPosition(i, true);
1241   i->op = OP_RSQ;
1242   bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
1243
1244   return true;
1245}
1246
1247bool
1248NV50LoweringPreSSA::handlePOW(Instruction *i)
1249{
1250   LValue *val = bld.getScratch();
1251
1252   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1253   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1254   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1255
1256   i->op = OP_EX2;
1257   i->setSrc(0, val);
1258   i->setSrc(1, NULL);
1259
1260   return true;
1261}
1262
1263bool
1264NV50LoweringPreSSA::handleEXPORT(Instruction *i)
1265{
1266   if (prog->getType() == Program::TYPE_FRAGMENT) {
1267      if (i->getIndirect(0, 0)) {
1268         // TODO: redirect to l[] here, load to GPRs at exit
1269         return false;
1270      } else {
1271         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
1272
1273         i->op = OP_MOV;
1274         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1275         i->src(0).set(i->src(1));
1276         i->setSrc(1, NULL);
1277         i->setDef(0, new_LValue(func, FILE_GPR));
1278         i->getDef(0)->reg.data.id = id;
1279
1280         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
1281      }
1282   }
1283   return true;
1284}
1285
1286// Handle indirect addressing in geometry shaders:
1287//
1288// ld $r0 a[$a1][$a2+k] ->
1289// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
1290//
1291bool
1292NV50LoweringPreSSA::handleLOAD(Instruction *i)
1293{
1294   ValueRef src = i->src(0);
1295
1296   if (src.isIndirect(1)) {
1297      assert(prog->getType() == Program::TYPE_GEOMETRY);
1298      Value *addr = i->getIndirect(0, 1);
1299
1300      if (src.isIndirect(0)) {
1301         // base address is in an address register, so move to a GPR
1302         Value *base = bld.getScratch();
1303         bld.mkMov(base, addr);
1304
1305         Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
1306         Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
1307         Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1308                                    i->getIndirect(0, 0), bld.mkImm(2));
1309
1310         // Calculate final address: addr = base + attr*vstride; use 16-bit
1311         // multiplication since 32-bit would be lowered to multiple
1312         // instructions, and we only need the low 16 bits of the result
1313         Value *a[2], *b[2];
1314         bld.mkSplit(a, 2, attrib);
1315         bld.mkSplit(b, 2, vstride);
1316         Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
1317                                 base);
1318
1319         // move address from GPR into an address register
1320         addr = bld.getSSA(2, FILE_ADDRESS);
1321         bld.mkMov(addr, sum);
1322      }
1323
1324      i->setIndirect(0, 1, NULL);
1325      i->setIndirect(0, 0, addr);
1326   }
1327
1328   return true;
1329}
1330
1331bool
1332NV50LoweringPreSSA::handlePFETCH(Instruction *i)
1333{
1334   assert(prog->getType() == Program::TYPE_GEOMETRY);
1335
1336   // NOTE: cannot use getImmediate here, not in SSA form yet, move to
1337   // later phase if that assertion ever triggers:
1338
1339   ImmediateValue *imm = i->getSrc(0)->asImm();
1340   assert(imm);
1341
1342   assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
1343
1344   if (i->srcExists(1)) {
1345      // indirect addressing of vertex in primitive space
1346
1347      LValue *val = bld.getScratch();
1348      Value *ptr = bld.getSSA(2, FILE_ADDRESS);
1349      bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
1350      bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
1351
1352      // NOTE: PFETCH directly to an $aX only works with direct addressing
1353      i->op = OP_SHL;
1354      i->setSrc(0, val);
1355      i->setSrc(1, bld.mkImm(0));
1356   }
1357
1358   return true;
1359}
1360
1361// Set flags according to predicate and make the instruction read $cX.
1362void
1363NV50LoweringPreSSA::checkPredicate(Instruction *insn)
1364{
1365   Value *pred = insn->getPredicate();
1366   Value *cdst;
1367
1368   // FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
1369   if (!pred ||
1370       pred->reg.file == FILE_FLAGS || pred->reg.file == FILE_PREDICATE)
1371      return;
1372
1373   cdst = bld.getSSA(1, FILE_FLAGS);
1374
1375   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
1376
1377   insn->setPredicate(insn->cc, cdst);
1378}
1379
1380//
1381// - add quadop dance for texturing
1382// - put FP outputs in GPRs
1383// - convert instruction sequences
1384//
1385bool
1386NV50LoweringPreSSA::visit(Instruction *i)
1387{
1388   bld.setPosition(i, false);
1389
1390   if (i->cc != CC_ALWAYS)
1391      checkPredicate(i);
1392
1393   switch (i->op) {
1394   case OP_TEX:
1395   case OP_TXF:
1396   case OP_TXG:
1397      return handleTEX(i->asTex());
1398   case OP_TXB:
1399      return handleTXB(i->asTex());
1400   case OP_TXL:
1401      return handleTXL(i->asTex());
1402   case OP_TXD:
1403      return handleTXD(i->asTex());
1404   case OP_TXLQ:
1405      return handleTXLQ(i->asTex());
1406   case OP_TXQ:
1407      return handleTXQ(i->asTex());
1408   case OP_EX2:
1409      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1410      i->setSrc(0, i->getDef(0));
1411      break;
1412   case OP_SET:
1413      return handleSET(i);
1414   case OP_SLCT:
1415      return handleSLCT(i->asCmp());
1416   case OP_SELP:
1417      return handleSELP(i);
1418   case OP_POW:
1419      return handlePOW(i);
1420   case OP_DIV:
1421      return handleDIV(i);
1422   case OP_SQRT:
1423      return handleSQRT(i);
1424   case OP_EXPORT:
1425      return handleEXPORT(i);
1426   case OP_LOAD:
1427      return handleLOAD(i);
1428   case OP_RDSV:
1429      return handleRDSV(i);
1430   case OP_WRSV:
1431      return handleWRSV(i);
1432   case OP_CALL:
1433      return handleCALL(i);
1434   case OP_PRECONT:
1435      return handlePRECONT(i);
1436   case OP_CONT:
1437      return handleCONT(i);
1438   case OP_PFETCH:
1439      return handlePFETCH(i);
1440   default:
1441      break;
1442   }
1443   return true;
1444}
1445
1446bool
1447TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
1448{
1449   bool ret = false;
1450
1451   if (stage == CG_STAGE_PRE_SSA) {
1452      NV50LoweringPreSSA pass(prog);
1453      ret = pass.run(prog, false, true);
1454   } else
1455   if (stage == CG_STAGE_SSA) {
1456      if (!prog->targetPriv)
1457         prog->targetPriv = new std::list<Instruction *>();
1458      NV50LegalizeSSA pass(prog);
1459      ret = pass.run(prog, false, true);
1460   } else
1461   if (stage == CG_STAGE_POST_RA) {
1462      NV50LegalizePostRA pass;
1463      ret = pass.run(prog, false, true);
1464      if (prog->targetPriv)
1465         delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
1466   }
1467   return ret;
1468}
1469
1470} // namespace nv50_ir
1471