/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
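// For example, QUADOP(SUBR, MOV2, SUBR, MOV2) packs to
// (1 << 6) | (3 << 4) | (1 << 2) | 3 == 0x77: a reversed subtract in the
// UL and LL lanes, and (presumably) a plain move in the UR and LR lanes.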

void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
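   // Judging by the mkMov above, the builtin leaves the quotient in $r0 and
   // the remainder in $r1; the clobber mask spares whichever register still
   // holds the live result: 0xe (r1-r3) for DIV, 0xd (r0, r2, r3) for MOD.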

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}

void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   assert(i->dType == TYPE_F64);
   // There are instructions that will compute the high 32 bits of the 64-bit
   // float. We will just stick 0 in the bottom 32 bits.

   bld.setPosition(i, false);

   // 1. Take the source and split it up.
   Value *src[2], *dst[2], *def = i->getDef(0);
   bld.mkSplit(src, 4, i->getSrc(0));

   // 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
   dst[0] = bld.loadImm(NULL, 0);
   dst[1] = bld.getSSA();

   // 3. The new version of the instruction takes the high 32 bits of the
   // source and outputs the high 32 bits of the destination.
   i->setSrc(0, src[1]);
   i->setDef(0, dst[1]);
   i->setType(TYPE_F32);
   i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;

   // 4. Recombine the two dst pieces back into the original destination.
   bld.setPosition(i, true);
   bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
}

void
NVC0LegalizeSSA::handleFTZ(Instruction *i)
{
   // Only want to flush float inputs
   assert(i->sType == TYPE_F32);

   // If we're already flushing denorms (and NaN's) to zero, no need for this.
   if (i->dnz)
      return;

   // Only certain classes of operations can flush
   OpClass cls = prog->getTarget()->getOpClass(i->op);
   if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
       cls != OPCLASS_CONVERT)
      return;

   i->ftz = true;
}

void
NVC0LegalizeSSA::handleTEXLOD(TexInstruction *i)
{
   if (i->tex.levelZero)
      return;

   ImmediateValue lod;

   // The LOD argument comes right after the coordinates (before depth bias,
   // offsets, etc).
   int arg = i->tex.target.getArgCount();

   // SM30+ stores the indirect handle as a separate arg, which comes before
   // the LOD.
   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET &&
       i->tex.rIndirectSrc >= 0)
      arg++;
   // SM20 stores indirect handle combined with array coordinate
   if (prog->getTarget()->getChipset() < NVISA_GK104_CHIPSET &&
       !i->tex.target.isArray() &&
       i->tex.rIndirectSrc >= 0)
      arg++;

   if (!i->src(arg).getImmediate(lod) || !lod.isInteger(0))
      return;

   if (i->op == OP_TXL)
      i->op = OP_TEX;
   i->tex.levelZero = true;
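   // The zero LOD source is implied by levelZero now; shift any remaining
   // sources down over it.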
   i->moveSources(arg + 1, -1);
}

bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}

bool
NVC0LegalizeSSA::visit(BasicBlock *bb)
{
   Instruction *next;
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (i->sType == TYPE_F32 && prog->getType() != Program::TYPE_COMPUTE)
         handleFTZ(i);

      switch (i->op) {
      case OP_DIV:
      case OP_MOD:
         if (i->sType != TYPE_F32)
            handleDIV(i);
         break;
      case OP_RCP:
      case OP_RSQ:
         if (i->dType == TYPE_F64)
            handleRCPRSQ(i);
         break;
      case OP_TXL:
      case OP_TXF:
         handleTEXLOD(i->asTex());
         break;
      default:
         break;
      }
   }
   return true;
}

NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     pOne(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0 &&
                prog->getTarget()->getChipset() < 0x110)
{
}

bool
NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
                                    const Instruction *early) const
{
   if (early->bb == later->bb)
      return early->serial < later->serial;
   return later->bb->dominatedBy(early->bb);
}

void
NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
                              Instruction *usei, const Instruction *texi)
{
   bool add = true;
   bool dominated = insnDominatedBy(usei, texi);
   // Uses before the tex have to all be included. Just because an earlier
   // instruction dominates another instruction doesn't mean that there's no
   // way to get from the tex to the later instruction. For example you could
   // have nested loops, with the tex in the inner loop, and uses before it in
   // both loops - even though the outer loop's instruction would dominate the
   // inner's, we still want a texbar before the inner loop's instruction.
   //
   // However we can still use the eliding logic between uses dominated by the
   // tex instruction, as that is unambiguously correct.
   if (dominated) {
      for (std::list<TexUse>::iterator it = uses.begin(); it != uses.end();) {
         if (it->after) {
            if (insnDominatedBy(usei, it->insn)) {
               add = false;
               break;
            }
            if (insnDominatedBy(it->insn, usei)) {
               it = uses.erase(it);
               continue;
            }
         }
         ++it;
      }
   }
   if (add)
      uses.push_back(TexUse(usei, texi, dominated));
}

// While it might be tempting to use an algorithm that just looks at tex
// uses, not all texture results are guaranteed to be used on all paths. In
// the case where along some control flow path a texture result is never used,
// we might reuse that register for something else, creating a
// write-after-write hazard. So we have to manually look through all
// instructions looking for ones that reference the registers in question.
void
NVC0LegalizePostRA::findFirstUses(
   Instruction *texi, std::list<TexUse> &uses)
{
   int minGPR = texi->def(0).rep()->reg.data.id;
   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;

   unordered_set<const BasicBlock *> visited;
   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
}

void
NVC0LegalizePostRA::findFirstUsesBB(
   int minGPR, int maxGPR, Instruction *start,
   const Instruction *texi, std::list<TexUse> &uses,
   unordered_set<const BasicBlock *> &visited)
{
   const BasicBlock *bb = start->bb;

   // We don't process the whole bb the first time around. This is correct;
   // however, we might be in a loop and hit this BB again, and need to
   // process the full thing. So only mark a bb as visited if we processed it
   // from the beginning.
   if (start == bb->getEntry()) {
      if (visited.find(bb) != visited.end())
         return;
      visited.insert(bb);
   }

   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
      if (insn->isNop())
         continue;

      for (int d = 0; insn->defExists(d); ++d) {
         const Value *def = insn->def(d).rep();
         if (insn->def(d).getFile() != FILE_GPR ||
             def->reg.data.id + def->reg.size / 4 - 1 < minGPR ||
             def->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }

      for (int s = 0; insn->srcExists(s); ++s) {
         const Value *src = insn->src(s).rep();
         if (insn->src(s).getFile() != FILE_GPR ||
             src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
             src->reg.data.id > maxGPR)
            continue;
         addTexUse(uses, insn, texi);
         return;
      }
   }

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
                      texi, uses, visited);
   }
}

// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
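//
// (The level ends up as the TEXBAR's subOp: a barrier of level n seems to
// allow up to n texture fetches to remain outstanding, which is also what
// lets the culling below drop any barrier whose subOp is >= the number of
// TEXes that can possibly be in flight at that point.)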
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i) {
      findFirstUses(texes[i], uses[i]);
   }

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;

   // insert the barriers
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   if (fn->getProgram()->optLevel < 3)
      return true;

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   return true;
}

bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   pOne = new_LValue(fn, FILE_PREDICATE);
   carry = new_LValue(fn, FILE_FLAGS);

   rZero->reg.data.id = (prog->getTarget()->getChipset() >= NVISA_GK20A_CHIPSET) ? 255 : 63;
   carry->reg.data.id = 0;
   pOne->reg.data.id = 7;

   return true;
}

void
NVC0LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      if (s == 2 && i->op == OP_SUCLAMP)
         continue;
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm) {
         if (i->op == OP_SELP && s == 2) {
            i->setSrc(s, pOne);
            if (imm->reg.data.u64 == 0)
               i->src(s).mod = i->src(s).mod ^ Modifier(NV50_IR_MOD_NOT);
         } else if (imm->reg.data.u64 == 0) {
            i->setSrc(s, rZero);
         }
      }
   }
}

// replace CONT with BRA for single unconditional continue
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}

// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   bb->remove(bb->getEntry());
}

bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
         replaceZero(i);
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_BAR && i->subOp == NV50_IR_SUBOP_BAR_SYNC &&
          prog->getType() != Program::TYPE_COMPUTE) {
         // It seems like barriers are never required for tessellation since
         // the warp size is 32, and there are always at most 32 tcs threads.
         bb->remove(i);
      } else
      if (i->op == OP_LOAD && i->subOp == NV50_IR_SUBOP_LDC_IS) {
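         // The immediate offset of the indirect form is presumably a signed
         // 16-bit field: fold the upper bits of larger offsets into the
         // constant buffer file index and keep the sign-extended low 16 bits.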
         int offset = i->src(0).get()->reg.data.offset;
         if (abs(offset) > 0x10000)
            i->src(0).get()->reg.fileIndex += offset >> 16;
         i->src(0).get()->reg.data.offset = (int)(short)offset;
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register!
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}

NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(2));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

   /* Only normalize in the non-explicit derivatives case. For explicit
    * derivatives, this is handled in handleManualTXD.
    */
   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
      Value *src[3], *val;
      int c;
      for (c = 0; c < 3; ++c)
         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
      val = bld.getScratch();
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c) {
         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
                                 i->getSrc(c), val));
      }
   }

   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
   // optional based on flags passed to the instruction. This summarizes the
   // order of things.
   //
   // Fermi:
   //  array/indirect
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets:
   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
   //    - other: 4 bits each, single reg
   //
   // Kepler+:
   //  indirect handle
   //  array (+ offsets for txd in upper 16 bits)
   //  coords
   //  sample
   //  lod bias
   //  depth compare
   //  offsets (same as fermi, except txd which takes it with array)
   //
   // Maxwell (tex):
   //  array
   //  coords
   //  indirect handle
   //  sample
   //  lod bias
   //  depth compare
   //  offsets
   //
   // Maxwell (txd):
   //  indirect handle
   //  coords
   //  array + offsets
   //  derivatives

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         // XXX this ignores tsc, and assumes a 1:1 mapping
         assert(i->tex.rIndirectSrc >= 0);
         Value *hnd = loadTexHandle(i->getIndirectR(), i->tex.r);
         i->tex.r = 0xff;
         i->tex.s = 0x1f;
         i->setIndirectR(hnd);
         i->setIndirectS(NULL);
      } else if (i->tex.r == i->tex.s || i->op == OP_TXF) {
         if (i->tex.r == 0xffff)
            i->tex.r = prog->driver->io.fbtexBindBase / 4;
         else
            i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s  = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         if (i->op != OP_TXD || chipset < NVISA_GM107_CHIPSET) {
            for (int s = dim; s >= 1; --s)
               i->setSrc(s, i->getSrc(s - 1));
            i->setSrc(0, layer);
         } else {
            i->setSrc(dim, layer);
         }
      }
      // Move the indirect reference to the first place
      if (i->tex.rIndirectSrc >= 0 && (
                i->op == OP_TXD || chipset < NVISA_GM107_CHIPSET)) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(0, 1);
         i->setSrc(0, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
      // Move the indirect reference to right after the coords
      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
         Value *hnd = i->getIndirectR();

         i->setIndirectR(NULL);
         i->moveSources(arg, 1);
         i->setSrc(arg, hnd);
         i->tex.rIndirectSrc = 0;
         i->tex.sIndirectSrc = -1;
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (i->tex.r == 0xffff) {
         i->tex.r = 0x20;
         i->tex.s = 0x10;
      }

      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         if (i->tex.r)
            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                ticRel, bld.mkImm(i->tex.r));
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         if (i->tex.s)
            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                                tscRel, bld.mkImm(i->tex.s));
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      if (arrayIndex) {
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, arrayIndex);
      } else {
         i->moveSources(0, 1);
      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      if (ticRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      if (tscRel)
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);

      i->setSrc(0, src);
   }

   // For nvc0, the sample id has to be in the second operand, as the offset
   // does. Right now we don't know how to pass both in, and this case can't
   // happen with OpenGL. On nve0, the sample id is part of the texture
   // coordinate argument.
   assert(chipset >= NVISA_GK104_CHIPSET ||
          !i->tex.useOffsets || !i->tex.target.isMS());

   // offset is between lod and dc
   if (i->tex.useOffsets) {
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->op != OP_TXD || chipset < NVISA_GK104_CHIPSET) {
         if (i->tex.target.isShadow())
            s--;
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 1);
         if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
            i->moveSources(s + 1, 1);
      }
      if (i->op == OP_TXG) {
         // Either there is 1 offset, which goes into the 2 low bytes of the
         // first source, or there are 4 offsets, which go into 2 sources (8
         // values, 1 byte each).
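         // (OP_INSBF's immediate appears to encode (width << 8) | bit offset,
         // so 0x800 | n below inserts an 8-bit field at bit position n.)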
         Value *offs[2] = {NULL, NULL};
         for (n = 0; n < i->tex.useOffsets; n++) {
            for (c = 0; c < 2; ++c) {
               if ((n % 2) == 0 && c == 0)
                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
               else
                  bld.mkOp3(OP_INSBF, TYPE_U32,
                            offs[n / 2],
                            i->offset[n][c].get(),
                            bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
                            offs[n / 2]);
            }
         }
         i->setSrc(s, offs[0]);
         if (offs[1])
            i->setSrc(s + 1, offs[1]);
      } else {
         unsigned imm = 0;
         assert(i->tex.useOffsets == 1);
         for (c = 0; c < 3; ++c) {
            ImmediateValue val;
            if (!i->offset[0][c].getImmediate(val))
               assert(!"non-immediate offset passed to non-TXG");
            imm |= (val.reg.data.u32 & 0xf) << (c * 4);
         }
         if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
            // The offset goes into the upper 16 bits of the array index. So
            // create it if it's not already there, and INSBF it if it already
            // is.
            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
            if (chipset >= NVISA_GM107_CHIPSET)
               s += dim;
            if (i->tex.target.isArray()) {
               bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
                         bld.loadImm(NULL, imm), bld.mkImm(0xc10),
                         i->getSrc(s));
            } else {
               i->moveSources(s, 1);
               i->setSrc(s, bld.loadImm(NULL, imm << 16));
            }
         } else {
            i->setSrc(s, bld.loadImm(NULL, imm));
         }
      }
   }

   if (chipset >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
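   // Each qOps[l] entry holds two quadop lane patterns used to rebuild lane
   // l's coordinates: the first mixes in dPdx across the horizontal
   // neighbours, the second mixes in dPdy across the vertical neighbours
   // (see the loop below).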
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();

   // This function is invoked after handleTEX lowering, so we have to expect
   // the arguments in the order that the hw wants them. For Fermi, array and
   // indirect are both in the leading arg, while for Kepler, array and
   // indirect are separate (and both precede the coordinates). Maxwell is
   // handled in a separate function.
   unsigned array;
   if (targ->getChipset() < NVISA_GK104_CHIPSET)
      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
   else
      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // normalize cube coordinates
      if (i->tex.target.isCube()) {
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
   unsigned arg = txd->tex.target.getArgCount();
   unsigned expected_args = arg;
   const int chipset = prog->getTarget()->getChipset();

   if (chipset >= NVISA_GK104_CHIPSET) {
      if (!txd->tex.target.isArray() && txd->tex.useOffsets)
         expected_args++;
      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
         expected_args++;
   } else {
      if (txd->tex.useOffsets)
         expected_args++;
      if (!txd->tex.target.isArray() && (
                txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0))
         expected_args++;
   }

   if (expected_args > 4 ||
       dim > 2 ||
       txd->tex.target.isShadow())
      txd->op = OP_TEX;

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   if (txd->op == OP_TEX)
      return handleManualTXD(txd);

   assert(arg == expected_args);
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }

   // In this case we have fewer than 4 "real" arguments, which means that
   // handleTEX didn't apply any padding. However we have to make sure that
   // the second "group" of arguments still gets padded up to 4.
   if (chipset >= NVISA_GK104_CHIPSET) {
      int s = arg + 2 * dim;
      if (s >= 4 && s < 7) {
         if (txd->srcExists(s)) // move potential predicate out of the way
            txd->moveSources(s, 7 - s);
         while (s < 7)
            txd->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   const int chipset = prog->getTarget()->getChipset();
   if (chipset >= NVISA_GK104_CHIPSET && txq->tex.rIndirectSrc < 0)
      txq->tex.r += prog->driver->io.texBindBase / 4;

   if (txq->tex.rIndirectSrc < 0)
      return true;

   Value *ticRel = txq->getIndirectR();

   txq->setIndirectS(NULL);
   txq->tex.sIndirectSrc = -1;

   assert(ticRel);

   if (chipset < NVISA_GK104_CHIPSET) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      txq->setSrc(txq->tex.rIndirectSrc, NULL);
      if (txq->tex.r)
         ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
                             ticRel, bld.mkImm(txq->tex.r));

      bld.mkOp2(OP_SHL, TYPE_U32, src, ticRel, bld.mkImm(0x17));

      txq->moveSources(0, 1);
      txq->setSrc(0, src);
   } else {
      Value *hnd = loadTexHandle(txq->getIndirectR(), txq->tex.r);
      txq->tex.r = 0xff;
      txq->tex.s = 0x1f;

      txq->setIndirectR(NULL);
      txq->moveSources(0, 1);
      txq->setSrc(0, hnd);
      txq->tex.rIndirectSrc = 0;
   }

   return true;
}

bool
NVC0LoweringPass::handleTXLQ(TexInstruction *i)
{
   /* The outputs are inverted compared to what the TGSI instruction
    * expects. Take that into account in the mask.
    */
   assert((i->tex.mask & ~3) == 0);
   if (i->tex.mask == 1)
      i->tex.mask = 2;
   else if (i->tex.mask == 2)
      i->tex.mask = 1;
   handleTEX(i);
   bld.setPosition(i, true);

   /* The returned values are not quite what we want:
    * (a) convert from s16/u16 to f32
    * (b) multiply by 1/256
    */
   for (int def = 0; def < 2; ++def) {
      if (!i->defExists(def))
         continue;
      enum DataType type = TYPE_S16;
      if (i->tex.mask == 2 || def > 0)
         type = TYPE_U16;
      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
      bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
                i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
   }
   if (i->tex.mask == 3) {
      LValue *t = new_LValue(func, FILE_GPR);
      bld.mkMov(t, i->getDef(0));
      bld.mkMov(i->getDef(0), i->getDef(1));
      bld.mkMov(i->getDef(1), t);
   }
   return true;
}

bool
NVC0LoweringPass::handleBUFQ(Instruction *bufq)
{
   bufq->op = OP_MOV;
   bufq->setSrc(0, loadBufLength32(bufq->getIndirect(0, 1),
                                   bufq->getSrc(0)->reg.fileIndex * 16));
   bufq->setIndirect(0, 0, NULL);
   bufq->setIndirect(0, 1, NULL);
   return true;
}

void
NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);
   BasicBlock *setAndUnlockBB = new BasicBlock(func);
   BasicBlock *failLockBB = new BasicBlock(func);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0), bld.mkImm(1));

   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);

   tryLockBB->cfg.detach(&joinBB->cfg);
   bld.remove(atom);

   bld.setPosition(setAndUnlockBB, true);
   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));

      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
                         atom->getSrc(1));
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setDef(0, pred->getDef(0));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);

   // Retry the lock sequence until the store has actually been performed.
   bld.setPosition(failLockBB, true);
   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);

   BasicBlock *currBB = atom->bb;
   BasicBlock *tryLockAndSetBB = atom->bb->splitBefore(atom, false);
   BasicBlock *joinBB = atom->bb->splitAfter(atom);

   bld.setPosition(currBB, true);
   assert(!currBB->joinAt);
   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);

   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_ALWAYS, NULL);
   currBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::TREE);

   bld.setPosition(tryLockAndSetBB, true);

   Instruction *ld =
      bld.mkLoad(TYPE_U32, atom->getDef(0), atom->getSrc(0)->asSym(),
                 atom->getIndirect(0, 0));
   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;

   Value *stVal;
   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
      // Read the old value, and write the new one.
      stVal = atom->getSrc(1);
   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      CmpInstruction *set =
         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
      set->setPredicate(CC_P, ld->getDef(1));

      Instruction *selp =
         bld.mkOp3(OP_SELP, TYPE_U32, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(2), set->getDef(0));
      selp->src(2).mod = Modifier(NV50_IR_MOD_NOT);
      selp->setPredicate(CC_P, ld->getDef(1));

      stVal = selp->getDef(0);
   } else {
      operation op;

      switch (atom->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         op = OP_ADD;
         break;
      case NV50_IR_SUBOP_ATOM_AND:
         op = OP_AND;
         break;
      case NV50_IR_SUBOP_ATOM_OR:
         op = OP_OR;
         break;
      case NV50_IR_SUBOP_ATOM_XOR:
         op = OP_XOR;
         break;
      case NV50_IR_SUBOP_ATOM_MIN:
         op = OP_MIN;
         break;
      case NV50_IR_SUBOP_ATOM_MAX:
         op = OP_MAX;
         break;
      default:
         assert(0);
         return;
      }

      Instruction *i =
         bld.mkOp2(op, atom->dType, bld.getSSA(), ld->getDef(0),
                   atom->getSrc(1));
      i->setPredicate(CC_P, ld->getDef(1));

      stVal = i->getDef(0);
   }

   Instruction *st =
      bld.mkStore(OP_STORE, TYPE_U32, atom->getSrc(0)->asSym(),
                  atom->getIndirect(0, 0), stVal);
   st->setPredicate(CC_P, ld->getDef(1));
   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;

   // Loop until the lock is acquired.
   bld.mkFlow(OP_BRA, tryLockAndSetBB, CC_NOT_P, ld->getDef(1));
   tryLockAndSetBB->cfg.attach(&tryLockAndSetBB->cfg, Graph::Edge::BACK);
   tryLockAndSetBB->cfg.attach(&joinBB->cfg, Graph::Edge::CROSS);
   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);

   bld.remove(atom);

   bld.setPosition(joinBB, false);
   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
}

bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;
   Value *ptr = atom->getIndirect(0, 0), *ind = atom->getIndirect(0, 1), *base;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
      // operations on shared memory. For Maxwell, ATOMS is enough.
      if (targ->getChipset() < NVISA_GK104_CHIPSET)
         handleSharedATOM(atom);
      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_BUFFER);
      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
      assert(base->reg.size == 8);
      atom->setIndirect(0, 0, base);
      atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;

      // Harden against out-of-bounds accesses
      Value *offset = bld.loadImm(NULL, atom->getSrc(0)->reg.data.offset + typeSizeof(atom->sType));
      Value *length = loadBufLength32(ind, atom->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (ptr)
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, ptr);
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      atom->setPredicate(CC_NOT_P, pred);
      if (atom->defExists(0)) {
         Value *zero, *dst = atom->getDef(0);
         atom->setDef(0, bld.getSSA());

         bld.setPosition(atom, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, atom->getDef(0), zero);
      }

      return true;
   }
   base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 1, NULL);
   atom->setIndirect(0, 0, base);

   return true;
}

bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
         return false;
      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. Its 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
      cas->setSrc(2, dreg);
   }

   return true;
}

inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U64, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off), ptr);
}

inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
   uint8_t b = prog->driver->io.auxCBSlot;
   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));

   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
}

inline Value *
NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
{
   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
{
   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
}

inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
 * They couldn't just have added an equivalent to TIC now, couldn't they?
 */
#define NVC0_SU_INFO_ADDR   0x00
#define NVC0_SU_INFO_FMT    0x04
#define NVC0_SU_INFO_DIM_X  0x08
#define NVC0_SU_INFO_PITCH  0x0c
#define NVC0_SU_INFO_DIM_Y  0x10
#define NVC0_SU_INFO_ARRAY  0x14
#define NVC0_SU_INFO_DIM_Z  0x18
#define NVC0_SU_INFO_UNK1C  0x1c
#define NVC0_SU_INFO_WIDTH  0x20
#define NVC0_SU_INFO_HEIGHT 0x24
#define NVC0_SU_INFO_DEPTH  0x28
#define NVC0_SU_INFO_TARGET 0x2c
#define NVC0_SU_INFO_BSIZE  0x30
#define NVC0_SU_INFO_RAW_X  0x34
#define NVC0_SU_INFO_MS_X   0x38
#define NVC0_SU_INFO_MS_Y   0x3c

#define NVC0_SU_INFO__STRIDE 0x40

#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
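
// The indexed forms above alias the named fields, e.g.
// NVC0_SU_INFO_SIZE(1) == NVC0_SU_INFO_HEIGHT (0x24) and
// NVC0_SU_INFO_MS(1) == NVC0_SU_INFO_MS_Y (0x3c).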
1568
1569inline Value *
1570NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off)
1571{
1572   uint32_t base = slot * NVC0_SU_INFO__STRIDE;
1573
1574   if (ptr) {
1575      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot));
1576      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
1577      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(6));
1578      base = 0;
1579   }
1580   off += base;
1581
1582   return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
1583}

static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
                                   NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
                                   NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}

bool
NVC0LoweringPass::handleSUQ(TexInstruction *suq)
{
   int mask = suq->tex.mask;
   int dim = suq->tex.target.getDim();
   int arg = dim + (suq->tex.target.isArray() || suq->tex.target.isCube());
   Value *ind = suq->getIndirectR();
   int slot = suq->tex.r;
   int c, d;

   for (c = 0, d = 0; c < 3; ++c, mask >>= 1) {
      if (c >= arg || !(mask & 1))
         continue;

      int offset;

      if (c == 1 && suq->tex.target == TEX_TARGET_1D_ARRAY) {
         offset = NVC0_SU_INFO_SIZE(2);
      } else {
         offset = NVC0_SU_INFO_SIZE(c);
      }
      bld.mkMov(suq->getDef(d++), loadSuInfo32(ind, slot, offset));
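      // Cube (array) targets report the total layer count in the su info,
      // i.e. 6 faces per cube, so divide the Z result by 6.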
      if (c == 2 && suq->tex.target.isCube())
         bld.mkOp2(OP_DIV, TYPE_U32, suq->getDef(d - 1), suq->getDef(d - 1),
                   bld.loadImm(NULL, 6));
   }

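   // The component after the dimensions is the sample count: ms_x/ms_y hold
   // log2 of the per-axis sample counts, so the total is 1 << (ms_x + ms_y)
   // (and simply 1 for non-MS targets).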
   if (mask & 1) {
      if (suq->tex.target.isMS()) {
         Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
         Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));
         Value *ms = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(), ms_x, ms_y);
         bld.mkOp2(OP_SHL, TYPE_U32, suq->getDef(d++), bld.loadImm(NULL, 1), ms);
      } else {
         bld.mkMov(suq->getDef(d++), bld.loadImm(NULL, 1));
      }
   }

   bld.remove(suq);
   return true;
}

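// Rewrite the coordinates of an MS surface access into coordinates on the
// underlying non-MS storage: roughly x' = (x << ms_x) + dx(s) and
// y' = (y << ms_y) + dy(s), where (dx, dy) is sample s's 8-byte entry in
// the MS info table. The sample source itself is dropped.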
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const int arg = tex->tex.target.getArgCount();
   int slot = tex->tex.r;

   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1);

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
   Value *ind = tex->getIndirectR();

   Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0));
   Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1);
}

// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;
   Value *ind = su->getIndirectR();

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      int dimc = c;

      if (c == 1 && su->tex.target == TEX_TARGET_1D_ARRAY) {
         // The array index is stored in the Z component for 1D arrays.
         dimc = 2;
      }

      src[c] = bld.getScratch();
      if (c == 0 && raw)
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_RAW_X);
      else
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_DIM(dimc));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, dimc);
   }
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = (su->tex.target.isArray() || su->tex.target.isCube()) ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         bf = src[0];
      } else {
         v = loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray() && !su->tex.target.isCube()) {
            z = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it to 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadSuInfo32(ind, slot, NVC0_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64-bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred1 =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));

   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      // fault as well if the shader-declared format's block size doesn't
      // match the bound surface's block size
      assert(format->components != 0);
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred1->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred1->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred1->getDef(0));

   // TODO: initialize def values to 0 when the surface operation is not
   // performed (not needed for stores). Also, fix the "address bounds test"
   // subtests from arb_shader_image_load_store-invalid for buffers; it seems
   // that the predicate is not correctly set by suclamp.
}

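// Per-component memory type of an image format, used by the unpacking in
// convertSurfaceFormat() below.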
static DataType
getSrcType(const TexInstruction::ImgFormatDesc *t, int c)
{
   switch (t->type) {
   case FLOAT: return t->bits[c] == 16 ? TYPE_F16 : TYPE_F32;
   case UNORM: return t->bits[c] == 8 ? TYPE_U8 : TYPE_U16;
   case SNORM: return t->bits[c] == 8 ? TYPE_S8 : TYPE_S16;
   case UINT:
      return (t->bits[c] == 8 ? TYPE_U8 :
              (t->bits[c] == 16 ? TYPE_U16 : TYPE_U32));
   case SINT:
      return (t->bits[c] == 8 ? TYPE_S8 :
              (t->bits[c] == 16 ? TYPE_S16 : TYPE_S32));
   }
   return TYPE_NONE;
}

static DataType
getDestType(const ImgType type) {
   switch (type) {
   case FLOAT:
   case UNORM:
   case SNORM:
      return TYPE_F32;
   case UINT:
      return TYPE_U32;
   case SINT:
      return TYPE_S32;
   default:
      assert(!"Impossible type");
      return TYPE_NONE;
   }
}

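// Turn a typed SULDP into a raw SULDB of the format's total bit width and
// unpack the components afterwards. For example, for an RGBA8 UNORM view
// this does a single 32-bit raw load, converts each byte lane to float with
// CVT, and scales by 1.0f / 255; components not present in the format read
// back as 0 (or 1 for alpha).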
void
NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su)
{
   const TexInstruction::ImgFormatDesc *format = su->tex.format;
   int width = format->bits[0] + format->bits[1] +
      format->bits[2] + format->bits[3];
   Value *untypedDst[4] = {};
   Value *typedDst[4] = {};

   // We must convert this to a generic load.
   su->op = OP_SULDB;

   su->dType = typeOfSize(width / 8);
   su->sType = TYPE_U8;

   for (int i = 0; i < width / 32; i++)
      untypedDst[i] = bld.getSSA();
   if (width < 32)
      untypedDst[0] = bld.getSSA();

   for (int i = 0; i < 4; i++) {
      typedDst[i] = su->getDef(i);
   }

   // Set the untyped dsts as the su's destinations
   for (int i = 0; i < 4; i++)
      su->setDef(i, untypedDst[i]);

   bld.setPosition(su, true);

   // Unpack each component into the typed dsts
   int bits = 0;
   for (int i = 0; i < 4; bits += format->bits[i], i++) {
      if (!typedDst[i])
         continue;
      if (i >= format->components) {
         if (format->type == FLOAT ||
             format->type == UNORM ||
             format->type == SNORM)
            bld.loadImm(typedDst[i], i == 3 ? 1.0f : 0.0f);
         else
            bld.loadImm(typedDst[i], i == 3 ? 1 : 0);
         continue;
      }

      // Get just that component's data into the relevant place
      if (format->bits[i] == 32)
         bld.mkMov(typedDst[i], untypedDst[i]);
      else if (format->bits[i] == 16)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[i / 2])
         ->subOp = (i & 1) << (format->type == FLOAT ? 0 : 1);
      else if (format->bits[i] == 8)
         bld.mkCvt(OP_CVT, getDestType(format->type), typedDst[i],
                   getSrcType(format, i), untypedDst[0])->subOp = i;
      else {
         bld.mkOp2(OP_EXTBF, TYPE_U32, typedDst[i], untypedDst[bits / 32],
                   bld.mkImm((bits % 32) | (format->bits[i] << 8)));
         if (format->type == UNORM || format->type == SNORM)
            bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], getSrcType(format, i), typedDst[i]);
      }

      // Normalize / convert as necessary
      if (format->type == UNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << format->bits[i]) - 1)));
      else if (format->type == SNORM)
         bld.mkOp2(OP_MUL, TYPE_F32, typedDst[i], typedDst[i], bld.loadImm(NULL, 1.0f / ((1 << (format->bits[i] - 1)) - 1)));
      else if (format->type == FLOAT && format->bits[i] < 16) {
         bld.mkOp2(OP_SHL, TYPE_U32, typedDst[i], typedDst[i], bld.loadImm(NULL, 15 - format->bits[i]));
         bld.mkCvt(OP_CVT, TYPE_F32, typedDst[i], TYPE_F16, typedDst[i]);
      }
   }

   if (format->bgra) {
      std::swap(typedDst[0], typedDst[2]);
   }
}

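// Lower a surface op on nve4: compute the address/format/predicate sources,
// and turn reductions into a predicated global OP_ATOM whose result is
// UNIONed with a predicated 0 so the destination is written even when the
// surface op itself is skipped.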
void
NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
{
   processSurfaceCoordsNVE4(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      assert(su->getPredicate());
      Value *pred =
         bld.mkOp2v(OP_OR, TYPE_U8, bld.getScratch(1, FILE_PREDICATE),
                    su->getPredicate(), su->getSrc(2));

      Instruction *red = bld.mkOp(OP_ATOM, su->dType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0));
      red->setSrc(1, su->getSrc(3));
      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(4));
      red->setIndirect(0, 0, su->getSrc(0));

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, pred);
      mov->setPredicate(CC_P, pred);

      bld.mkOp2(OP_UNION, TYPE_U32, su->getDef(0),
                red->getDef(0), mov->getDef(0));

      delete_Instruction(bld.getProgram(), su);
      handleCasExch(red, true);
   }

   if (su->op == OP_SUSTB || su->op == OP_SUSTP)
      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
}

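// On nvc0, SULD/SUST operate on the bound surface itself; all that is done
// here is pre-scaling the X coordinate by the per-block byte size for
// formatted accesses and the layer index by the layer stride, both taken
// from the su info table.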
void
NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   int c;
   Value *zero = bld.mkImm(0);
   Value *src[3];
   Value *v;
   Value *ind = su->getIndirectR();

   bld.setPosition(su, false);

   adjustCoordinatesMS(su);

   if (ind) {
      Value *ptr;
      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
      su->setIndirectR(ptr);
   }

   // get surface coordinates
   for (c = 0; c < arg; ++c)
      src[c] = su->getSrc(c);
   for (; c < 3; ++c)
      src[c] = zero;

   // calculate pixel offset
   if (su->op == OP_SULDP || su->op == OP_SUREDP) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE);
      su->setSrc(0, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[0], v));
   }

   // add array layer offset
   if (su->tex.target.isArray() || su->tex.target.isCube()) {
      v = loadSuInfo32(ind, slot, NVC0_SU_INFO_ARRAY);
      assert(dim > 1);
      su->setSrc(2, bld.mkOp2v(OP_MUL, TYPE_U32, bld.getSSA(), src[2], v));
   }

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // fault as well if the shader-declared format's block size doesn't
      // match the bound surface's (only checked when it's not FMT_NONE)
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

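// Lower a surface op on nvc0. Reductions become OP_SULEA (compute the
// surface address) followed by a predicated global OP_ATOM, again with a
// 0 fallback UNIONed into the result.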
void
NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su)
{
   if (su->tex.target == TEX_TARGET_1D_ARRAY) {
      /* A 1d array access ends up needing 3 coordinates anyway (the layer
       * goes in Z), so insert a zero Y and switch to TEX_TARGET_2D_ARRAY;
       * this simplifies the lowering pass and the texture constraints. */
      su->moveSources(1, 1);
      su->setSrc(1, bld.loadImm(NULL, 0));
      su->tex.target = TEX_TARGET_2D_ARRAY;
   }

   processSurfaceCoordsNVC0(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
      const int dim = su->tex.target.getDim();
      const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
      LValue *addr = bld.getSSA(8);
      Value *def = su->getDef(0);

      su->op = OP_SULEA;

      // Set the destination to the address
      su->dType = TYPE_U64;
      su->setDef(0, addr);
      su->setDef(1, su->getPredicate());

      bld.setPosition(su, true);

      // Perform the atomic op
      Instruction *red = bld.mkOp(OP_ATOM, su->sType, bld.getSSA());
      red->subOp = su->subOp;
      red->setSrc(0, bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, su->sType, 0));
      red->setSrc(1, su->getSrc(arg));
      if (red->subOp == NV50_IR_SUBOP_ATOM_CAS)
         red->setSrc(2, su->getSrc(arg + 1));
      red->setIndirect(0, 0, addr);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      red->setPredicate(su->cc, su->getPredicate());
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, red->getDef(0), mov->getDef(0));

      handleCasExch(red, false);
   }
}

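// On gm107, surface instructions take a texture handle much like TEX does.
// Load it from the driver constbuf (image handles appear to live at
// slot + 32, past the texture slots) and append it after the op's other
// sources; its position depends on how many sources the op already has.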
void
NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su)
{
   const int slot = su->tex.r;
   const int dim = su->tex.target.getDim();
   const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube());
   Value *ind = su->getIndirectR();
   int pos = 0;

   bld.setPosition(su, false);

   // add texture handle
   switch (su->op) {
   case OP_SUSTP:
      pos = 4;
      break;
   case OP_SUREDP:
      pos = (su->subOp == NV50_IR_SUBOP_ATOM_CAS) ? 2 : 1;
      break;
   default:
      assert(pos == 0);
      break;
   }
   su->setSrc(arg + pos, loadTexHandle(ind, slot + 32));

   // prevent read fault when the image is not actually bound
   CmpInstruction *pred =
      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
                TYPE_U32, bld.mkImm(0),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR));
   if (su->op != OP_SUSTP && su->tex.format) {
      const TexInstruction::ImgFormatDesc *format = su->tex.format;
      int blockwidth = format->bits[0] + format->bits[1] +
                       format->bits[2] + format->bits[3];

      assert(format->components != 0);
      // fault as well if the shader-declared format's block size doesn't
      // match the bound surface's (only checked when it's not FMT_NONE)
      bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0),
                TYPE_U32, bld.loadImm(NULL, blockwidth / 8),
                loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE),
                pred->getDef(0));
   }
   su->setPredicate(CC_NOT_P, pred->getDef(0));
}

void
NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su)
{
   processSurfaceCoordsGM107(su);

   if (su->op == OP_SULDP)
      convertSurfaceFormat(su);

   if (su->op == OP_SUREDP) {
      Value *def = su->getDef(0);

      su->op = OP_SUREDB;
      su->setDef(0, bld.getSSA());

      bld.setPosition(su, true);

      // make sure to initialize dst value when the atomic operation is not
      // performed
      Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0));

      assert(su->cc == CC_NOT_P);
      mov->setPredicate(CC_P, su->getPredicate());

      bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0));
   }
}

bool
NVC0LoweringPass::handleWRSV(Instruction *i)
{
   Instruction *st;
   Symbol *sym;
   uint32_t addr;

   // must replace: $sreg system registers are not writeable
   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
   if (addr >= 0x400)
      return false;
   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);

   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
                    i->getSrc(1));
   st->perPatch = i->perPatch;

   bld.getBB()->remove(i);
   return true;
}

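// Lower LOAD/STORE according to the memory file accessed: shader inputs
// become VFETCH (or constbuf reads for compute), Kepler compute UBOs and
// FILE_MEMORY_BUFFER accesses turn into bounds-checked global memory
// accesses through driver-provided address/length pairs, and tess control
// shader outputs become VFETCH as well.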
void
NVC0LoweringPass::handleLDST(Instruction *i)
{
   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
      if (prog->getType() == Program::TYPE_COMPUTE) {
         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
         i->getSrc(0)->reg.fileIndex = 0;
      } else
      if (prog->getType() == Program::TYPE_GEOMETRY &&
          i->src(0).isIndirect(0)) {
         // XXX: this assumes vec4 units
         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                 i->getIndirect(0, 0), bld.mkImm(4));
         i->setIndirect(0, 0, ptr);
         i->op = OP_VFETCH;
      } else {
         i->op = OP_VFETCH;
         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
      }
   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
          prog->getType() == Program::TYPE_COMPUTE) {
         // The launch descriptor only allows 8 CBs to be set up, but OpenGL
         // requires at least 12 UBOs. To get around this limitation, we store
         // the UBO addresses in the driver constbuf and load directly from
         // global memory.
         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
         Value *ind = i->getIndirect(0, 1);

         if (!ind && fileIndex == -1)
            return;

         if (ind) {
            // Clamp the UBO index when an indirect access is used to avoid
            // loading information from the wrong place in the driver cb.
            // TODO - synchronize the max with the driver.
            ind = bld.mkOp2v(OP_MIN, TYPE_U32, ind,
                             bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(),
                                        ind, bld.loadImm(NULL, fileIndex)),
                             bld.loadImm(NULL, 13));
            fileIndex = 0;
         }

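         // Bounds check: 'offset' is the end of the access (reg offset plus
         // access size); the load only executes when offset <= length, and
         // a predicated 0 is UNIONed in so the destination is always written.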
         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
         Value *length = loadUboLength32(ind, fileIndex * 16);
         Value *pred = new_LValue(func, FILE_PREDICATE);
         if (i->src(0).isIndirect(0)) {
            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
         }
         i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
         i->setPredicate(CC_NOT_P, pred);
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      } else if (i->src(0).isIndirect(1)) {
         Value *ptr;
         if (i->src(0).isIndirect(0))
            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(0x1010),
                             i->getIndirect(0, 0));
         else
            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                             i->getIndirect(0, 1), bld.mkImm(16));
         i->setIndirect(0, 1, NULL);
         i->setIndirect(0, 0, ptr);
         i->subOp = NV50_IR_SUBOP_LDC_IS;
      }
   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
      i->op = OP_VFETCH;
   } else if (i->src(0).getFile() == FILE_MEMORY_BUFFER) {
      Value *ind = i->getIndirect(0, 1);
      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
      // XXX: come up with a way not to do this for EVERY little access, but
      // rather to batch these up somehow. Unfortunately we've lost the
      // information about the field width by the time we get here.
      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
      Value *pred = new_LValue(func, FILE_PREDICATE);
      if (i->src(0).isIndirect(0)) {
         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
      }
      i->setIndirect(0, 1, NULL);
      i->setIndirect(0, 0, ptr);
      i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
      i->setPredicate(CC_NOT_P, pred);
      if (i->defExists(0)) {
         Value *zero, *dst = i->getDef(0);
         i->setDef(0, bld.getSSA());

         bld.setPosition(i, true);
         bld.mkMov((zero = bld.getSSA()), bld.mkImm(0))
            ->setPredicate(CC_P, pred);
         bld.mkOp2(OP_UNION, TYPE_U32, dst, i->getDef(0), zero);
      }
   }
}

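// Read one component of the tess coord. u and v are fetched per lane from
// shader output space (0x2f0 / 0x2f4); for the triangle domain the third
// coordinate is computed as 1.0 - u - v, and it is 0 otherwise.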
void
NVC0LoweringPass::readTessCoord(LValue *dst, int c)
{
   Value *laneid = bld.getSSA();
   Value *x, *y;

   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));

   if (c == 0) {
      x = dst;
      y = NULL;
   } else
   if (c == 1) {
      x = NULL;
      y = dst;
   } else {
      assert(c == 2);
      if (prog->driver->prop.tp.domain != PIPE_PRIM_TRIANGLES) {
         bld.mkMov(dst, bld.loadImm(NULL, 0));
         return;
      }
      x = bld.getSSA();
      y = bld.getSSA();
   }
   if (x)
      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
   if (y)
      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);

   if (c == 2) {
      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
   }
}

bool
NVC0LoweringPass::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   const SVSemantic sv = sym->reg.data.sv.sv;
   Value *vtx = NULL;
   Instruction *ld;
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);

   if (addr >= 0x400) {
      // mov $sreg
      if (sym->reg.data.sv.index == 3) {
         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
      }
      if (sv == SV_VERTEX_COUNT) {
         bld.setPosition(i, true);
         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0), bld.mkImm(0x808));
      }
      return true;
   }

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      if (i->srcExists(1)) {
         // Pass offset through to the interpolation logic
         ld = bld.mkInterp(NV50_IR_INTERP_LINEAR | NV50_IR_INTERP_OFFSET,
                           i->getDef(0), addr, NULL);
         ld->setSrc(1, i->getSrc(1));
      } else {
         bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      }
      break;
   case SV_FACE:
   {
      Value *face = i->getDef(0);
      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
      if (i->dType == TYPE_F32) {
         bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
         bld.mkOp1(OP_NEG, TYPE_S32, face, face);
         bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
      }
   }
      break;
   case SV_TESS_COORD:
      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
      break;
   case SV_NTID:
   case SV_NCTAID:
   case SV_GRIDID:
      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
      if (sym->reg.data.sv.index == 3) {
         i->op = OP_MOV;
         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
         return true;
      }
      // Fallthrough
   case SV_WORK_DIM:
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                              TYPE_U32, addr), NULL);
      break;
   case SV_SAMPLE_INDEX:
      // TODO: Properly pass source as an address in the PIX address space
      // (which can be of the form [r0+offset]). But this is currently
      // unnecessary.
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      break;
   case SV_SAMPLE_POS: {
      Value *off = new_LValue(func, FILE_GPR);
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
      bld.mkLoad(TYPE_F32,
                 i->getDef(0),
                 bld.mkSymbol(
                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase +
                       4 * sym->reg.data.sv.index),
                 off);
      break;
   }
   case SV_SAMPLE_MASK: {
      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
      ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
      Instruction *sampleid =
         bld.mkOp1(OP_PIXLD, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      sampleid->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
      Value *masked =
         bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ld->getDef(0),
                    bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                               bld.loadImm(NULL, 1), sampleid->getDef(0)));
      if (prog->driver->prop.fp.persampleInvocation) {
         bld.mkMov(i->getDef(0), masked);
      } else {
         bld.mkOp3(OP_SELP, TYPE_U32, i->getDef(0), ld->getDef(0), masked,
                   bld.mkImm(0))
            ->subOp = 1;
      }
      break;
   }
   case SV_BASEVERTEX:
   case SV_BASEINSTANCE:
   case SV_DRAWID:
      ld = bld.mkLoad(TYPE_U32, i->getDef(0),
                      bld.mkSymbol(FILE_MEMORY_CONST,
                                   prog->driver->io.auxCBSlot,
                                   TYPE_U32,
                                   prog->driver->io.drawInfoBase +
                                   4 * (sv - SV_BASEVERTEX)),
                      NULL);
      break;
   default:
      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL && !i->perPatch)
         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         bld.mkInterp(NV50_IR_INTERP_FLAT, i->getDef(0), addr, NULL);
      } else {
         ld = bld.mkFetch(i->getDef(0), i->dType,
                          FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
         ld->perPatch = i->perPatch;
      }
      break;
   }
   bld.getBB()->remove(i);
   return true;
}

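// Lower floating-point DIV to a * rcp(b) (a fast approximation, not a
// correctly-rounded division); integer DIV is left untouched here.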
bool
NVC0LoweringPass::handleDIV(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   bld.setPosition(i, false);
   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
   i->op = OP_MUL;
   i->setSrc(1, rcp->getDef(0));
   return true;
}

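// Lower floating-point MOD to a - b * trunc(a * rcp(b)), with the same
// approximation caveats as handleDIV; integer MOD is left untouched here.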
bool
NVC0LoweringPass::handleMOD(Instruction *i)
{
   if (!isFloatType(i->dType))
      return true;
   LValue *value = bld.getScratch(typeSizeof(i->dType));
   bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
   bld.mkOp1(OP_TRUNC, i->dType, value, value);
   bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
   i->op = OP_SUB;
   i->setSrc(1, value);
   return true;
}

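// Lower SQRT. For F64, sqrt(x) = x * rsq(x); the SELP guard makes
// non-positive inputs produce 0 (0 * rsq(0) would otherwise be
// 0 * inf = NaN). For F32 it becomes rcp(rsq(x)).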
bool
NVC0LoweringPass::handleSQRT(Instruction *i)
{
   if (i->dType == TYPE_F64) {
      Value *pred = bld.getSSA(1, FILE_PREDICATE);
      Value *zero = bld.loadImm(NULL, 0.0);
      Value *dst = bld.getSSA(8);
      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
      bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred);
      i->op = OP_MUL;
      i->setSrc(1, dst);
      // TODO: Handle this properly with a library function
   } else {
      bld.setPosition(i, true);
      i->op = OP_RSQ;
      bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0));
   }

   return true;
}

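// Lower POW to 2^(y * log2(x)); PREEX2 prepares the product for the EX2
// that the original instruction is turned into.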
bool
NVC0LoweringPass::handlePOW(Instruction *i)
{
   LValue *val = bld.getScratch();

   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);

   i->op = OP_EX2;
   i->setSrc(0, val);
   i->setSrc(1, NULL);

   return true;
}

bool
NVC0LoweringPass::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      int id = i->getSrc(0)->reg.data.offset / 4;

      if (i->src(0).isIndirect(0)) // TODO, ugly
         return false;
      i->op = OP_MOV;
      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
      i->src(0).set(i->src(1));
      i->setSrc(1, NULL);
      i->setDef(0, new_LValue(func, FILE_GPR));
      i->getDef(0)->reg.data.id = id;

      prog->maxGPR = MAX2(prog->maxGPR, id);
   } else
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      i->setIndirect(0, 1, gpEmitAddress);
   }
   return true;
}

bool
NVC0LoweringPass::handleOUT(Instruction *i)
{
   Instruction *prev = i->prev;
   ImmediateValue stream, prevStream;

   // Only merge if the stream ids match. Also, note that the previous
   // instruction would have already been lowered, so we take arg1 from it.
   if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
       i->src(0).getImmediate(stream) &&
       prev->src(1).getImmediate(prevStream) &&
       stream.reg.data.u32 == prevStream.reg.data.u32) {
      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
      delete_Instruction(prog, i);
   } else {
      assert(gpEmitAddress);
      i->setDef(0, gpEmitAddress);
      i->setSrc(1, i->getSrc(0));
      i->setSrc(0, gpEmitAddress);
   }
   return true;
}

// Generate a binary predicate if an instruction is predicated by
// e.g. an f32 value.
void
NVC0LoweringPass::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *pdst;

   if (!pred || pred->reg.file == FILE_PREDICATE)
      return;
   pdst = new_LValue(func, FILE_PREDICATE);

   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
   //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass

   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);

   insn->setPredicate(insn->cc, pdst);
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NVC0LoweringPass::visit(Instruction *i)
{
   bool ret = true;
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      ret = handleEXPORT(i);
      break;
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
      handleLDST(i);
      break;
   case OP_ATOM:
   {
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_BUFFER;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      if (targ->getChipset() >= NVISA_GM107_CHIPSET)
         handleSurfaceOpGM107(i->asTex());
      else if (targ->getChipset() >= NVISA_GK104_CHIPSET)
         handleSurfaceOpNVE4(i->asTex());
      else
         handleSurfaceOpNVC0(i->asTex());
      break;
   case OP_SUQ:
      handleSUQ(i->asTex());
      break;
   case OP_BUFQ:
      handleBUFQ(i);
      break;
   default:
      break;
   }

   /* Kepler+ has a special opcode to compute a new base address to be used
    * for indirect loads.
    *
    * Maxwell+ has an additional similar requirement for indirect
    * interpolation ops in frag shaders.
    */
   bool doAfetch = false;
   if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
       !i->perPatch &&
       (i->op == OP_VFETCH || i->op == OP_EXPORT) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }
   if (targ->getChipset() >= NVISA_GM107_CHIPSET &&
       (i->op == OP_LINTERP || i->op == OP_PINTERP) &&
       i->src(0).isIndirect(0)) {
      doAfetch = true;
   }

   if (doAfetch) {
      Value *addr = cloneShallow(func, i->getSrc(0));
      Instruction *afetch = bld.mkOp1(OP_AFETCH, TYPE_U32, bld.getSSA(),
                                      i->getSrc(0));
      afetch->setIndirect(0, 0, i->getIndirect(0, 0));
      addr->reg.data.offset = 0;
      i->setSrc(0, addr);
      i->setIndirect(0, 0, afetch->getDef(0));
   }

   return ret;
}

bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir
