1/*
2 * Copyright 2011 Christoph Bumiller
3 *           2014 Red Hat Inc.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be included in
13 * all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 * OTHER DEALINGS IN THE SOFTWARE.
22 */
23
24#include "codegen/nv50_ir.h"
25#include "codegen/nv50_ir_build_util.h"
26
27#include "codegen/nv50_ir_target_nvc0.h"
28#include "codegen/nv50_ir_lowering_gm107.h"
29
30#include <limits>
31
32namespace nv50_ir {
33
// Per-lane operation codes packed into the subOp of an OP_QUADOP.
// Each code occupies two bits of the packed value built by QUADOP().
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Pack four lane operations (given without the QOP_ prefix) into one value.
// Argument order is UL, UR, LL, LR; UL lands in bits 7:6, UR in bits 5:4,
// LL in bits 3:2 and LR in bits 1:0.
#define QUADOP(ul, ur, ll, lr)                  \
   ((QOP_##ul << 6) |                           \
    (QOP_##ur << 4) |                           \
    (QOP_##ll << 2) |                           \
    (QOP_##lr << 0))
43
44void
45GM107LegalizeSSA::handlePFETCH(Instruction *i)
46{
47   Value *src0;
48
49   if (i->src(0).getFile() == FILE_GPR && !i->srcExists(1))
50      return;
51
52   bld.setPosition(i, false);
53   src0 = bld.getSSA();
54
55   if (i->srcExists(1))
56      bld.mkOp2(OP_ADD , TYPE_U32, src0, i->getSrc(0), i->getSrc(1));
57   else
58      bld.mkOp1(OP_MOV , TYPE_U32, src0, i->getSrc(0));
59
60   i->setSrc(0, src0);
61   i->setSrc(1, NULL);
62}
63
64void
65GM107LegalizeSSA::handleLOAD(Instruction *i)
66{
67   if (i->src(0).getFile() != FILE_MEMORY_CONST)
68      return;
69   if (i->src(0).isIndirect(0))
70      return;
71   if (typeSizeof(i->dType) != 4)
72      return;
73
74   i->op = OP_MOV;
75}
76
77bool
78GM107LegalizeSSA::visit(Instruction *i)
79{
80   switch (i->op) {
81   case OP_PFETCH:
82      handlePFETCH(i);
83      break;
84   case OP_LOAD:
85      handleLOAD(i);
86      break;
87   default:
88      break;
89   }
90   return true;
91}
92
// Lower a texture instruction that carries explicit derivatives (dPdx/dPdy)
// into four OP_TEX fetches, one per lane index l = 0..3.
//
// For each l, between an OP_QUADON / OP_QUADPOP pair:
//  - the coordinates of lane l are shuffled into crd[] (OP_SHFL),
//  - dPdx and dPdy of lane l are combined into crd[] with OP_QUADOP, using
//    the per-lane operations from qOps[l],
//  - a clone of the instruction is issued with those coordinates,
//  - each result component is copied out by a mov whose lanes mask is 1 << l.
// Finally one OP_UNION per result component merges the four copies into the
// original destination, and the original instruction is removed from its
// basic block.
//
// Always returns true.
bool
GM107LoweringPass::handleManualTXD(TexInstruction *i)
{
   // QUADOP sub-ops per lane index l: [l][0] is used with dPdx, [l][1] with
   // dPdy.
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // def[c][l]: copy of result component c made for lane l
   Value *crd[3];    // working coordinates, reused across all four lanes
   Value *tmp;       // holds the shuffled derivative of the current component
   Instruction *tex, *add;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   // number of coordinates; cube targets count one more than getDim()
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   // coordinate c is read from / written to source c + array
   // NOTE(review): presumably this skips the array index in source 0 -- confirm
   const int array = i->tex.target.isArray();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c + array), bld.mkImm(l));
         // subOp 0x00 is QOP_ADD in all four lane fields, second operand zero
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero);
         add->subOp = 0x00;
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l));
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[l][0];
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l));
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[l][1];
         add->lanes = 1; /* abused for .ndv */
      }

      // normalize cube coordinates if necessary
      if (i->tex.target.isCube()) {
         // val = 1 / max(|crd[0]|, |crd[1]|, |crd[2]|); src[c] = crd[c] * val
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
         val = bld.getScratch();
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
         for (c = 0; c < 3; ++c)
            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
      } else {
         // non-cube targets use the working coordinates directly
         for (c = 0; c < dim; ++c)
            src[c] = crd[c];
      }

      // texture
      // issue a clone of the (now OP_TEX) instruction with the new coordinates
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c + array, src[c]);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         // NOTE(review): fixed presumably keeps later passes from touching
         // this mov -- confirm
         mov->fixed = 1;
         mov->lanes = 1 << l; // lane mask with only bit l set
      }
   }

   // merge the four per-lane copies of each component into the original def
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   // the original instruction is fully replaced by the sequence above
   i->bb->remove(i);
   return true;
}
185
186bool
187GM107LoweringPass::handleDFDX(Instruction *insn)
188{
189   Instruction *shfl;
190   int qop = 0, xid = 0;
191
192   switch (insn->op) {
193   case OP_DFDX:
194      qop = QUADOP(SUB, SUBR, SUB, SUBR);
195      xid = 1;
196      break;
197   case OP_DFDY:
198      qop = QUADOP(SUB, SUB, SUBR, SUBR);
199      xid = 2;
200      break;
201   default:
202      assert(!"invalid dfdx opcode");
203      break;
204   }
205
206   shfl = bld.mkOp2(OP_SHFL, TYPE_F32, bld.getScratch(),
207                    insn->getSrc(0), bld.mkImm(xid));
208   shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
209   insn->op = OP_QUADOP;
210   insn->subOp = qop;
211   insn->lanes = 0; /* abused for !.ndv */
212   insn->setSrc(1, insn->getSrc(0));
213   insn->setSrc(0, shfl->getDef(0));
214   return true;
215}
216
217bool
218GM107LoweringPass::handlePFETCH(Instruction *i)
219{
220   Value *tmp0 = bld.getScratch();
221   Value *tmp1 = bld.getScratch();
222   Value *tmp2 = bld.getScratch();
223   bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
224   bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16));
225   bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff));
226   bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff));
227   if (i->getSrc(1))
228      bld.mkOp2(OP_ADD , TYPE_U32, tmp2, i->getSrc(0), i->getSrc(1));
229   else
230      bld.mkOp1(OP_MOV , TYPE_U32, tmp2, i->getSrc(0));
231   bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
232   i->setSrc(0, tmp0);
233   i->setSrc(1, NULL);
234   return true;
235}
236
237bool
238GM107LoweringPass::handlePOPCNT(Instruction *i)
239{
240   Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
241                           i->getSrc(0), i->getSrc(1));
242   i->setSrc(0, tmp);
243   i->setSrc(1, NULL);
244   return true;
245}
246
247//
248// - add quadop dance for texturing
249// - put FP outputs in GPRs
250// - convert instruction sequences
251//
252bool
253GM107LoweringPass::visit(Instruction *i)
254{
255   bld.setPosition(i, false);
256
257   if (i->cc != CC_ALWAYS)
258      checkPredicate(i);
259
260   switch (i->op) {
261   case OP_PFETCH:
262      return handlePFETCH(i);
263   case OP_DFDX:
264   case OP_DFDY:
265      return handleDFDX(i);
266   case OP_POPCNT:
267      return handlePOPCNT(i);
268   default:
269      return NVC0LoweringPass::visit(i);
270   }
271}
272
273} // namespace nv50_ir
274