PerfectShuffle.cpp revision 64a8dddb73bba20dd24fb3a233a39cbc79040fef
1//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file was developed by Chris Lattner and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file computes an optimal sequence of instructions for doing all shuffles
11// of two 4-element vectors.  With a release build and when configured to emit
12// an altivec instruction table, this takes about 30s to run on a 2.7Ghz
13// PowerPC G5.
14//
15//===----------------------------------------------------------------------===//
16
17#include <iostream>
18#include <vector>
19#include <cassert>
20
21struct Operator;
22
23// Masks are 4-nibble hex numbers.  Values 0-7 in any nibble means that it takes
24// an element from that value of the input vectors.  A value of 8 means the
25// entry is undefined.
26
27// Mask manipulation functions.
28static inline unsigned short MakeMask(unsigned V0, unsigned V1,
29                                      unsigned V2, unsigned V3) {
30  return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
31}
32
33/// getMaskElt - Return element N of the specified mask.
34static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
35  return (Mask >> ((3-Elt)*4)) & 0xF;
36}
37
38static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
39  unsigned FieldShift = ((3-Elt)*4);
40  return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
41}
42
43// Reject elements where the values are 9-15.
44static bool isValidMask(unsigned short Mask) {
45  unsigned short UndefBits = Mask & 0x8888;
46  return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
47}
48
49/// hasUndefElements - Return true if any of the elements in the mask are undefs
50///
51static bool hasUndefElements(unsigned short Mask) {
52  return (Mask & 0x8888) != 0;
53}
54
55/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
56/// including undef values..
57static bool isOnlyLHSMask(unsigned short Mask) {
58  return (Mask & 0x4444) == 0;
59}
60
61/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
62/// refer to the LHS only (for when one argument value is passed into the same
63/// function twice).
64#if 0
65static unsigned short getLHSOnlyMask(unsigned short Mask) {
66  return Mask & 0xBBBB;  // Keep only LHS and Undefs.
67}
68#endif
69
70/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
71/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
72static unsigned getCompressedMask(unsigned short Mask) {
73  return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
74         getMaskElt(Mask, 2)*9     + getMaskElt(Mask, 3);
75}
76
77static void PrintMask(unsigned i, std::ostream &OS) {
78  OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
79     << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
80     << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
81     << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
82     << ">";
83}
84
85/// ShuffleVal - This represents a shufflevector operation.
86struct ShuffleVal {
87  unsigned Cost;  // Number of instrs used to generate this value.
88  Operator *Op;   // The Operation used to generate this value.
89  unsigned short Arg0, Arg1;  // Input operands for this value.
90
91  ShuffleVal() : Cost(1000000) {}
92};
93
94
95/// ShufTab - This is the actual shuffle table that we are trying to generate.
96///
97static ShuffleVal ShufTab[65536];
98
99/// TheOperators - All of the operators that this target supports.
100static std::vector<Operator*> TheOperators;
101
102/// Operator - This is a vector operation that is available for use.
103struct Operator {
104  unsigned short ShuffleMask;
105  unsigned short OpNum;
106  const char *Name;
107
108  Operator(unsigned short shufflemask, const char *name, unsigned opnum)
109    : ShuffleMask(shufflemask), OpNum(opnum), Name(name) {
110    TheOperators.push_back(this);
111  }
112  ~Operator() {
113    assert(TheOperators.back() == this);
114    TheOperators.pop_back();
115  }
116
117  bool isOnlyLHSOperator() const {
118    return isOnlyLHSMask(ShuffleMask);
119  }
120
121  const char *getName() const { return Name; }
122
123  unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
124    // Extract the elements from LHSMask and RHSMask, as appropriate.
125    unsigned Result = 0;
126    for (unsigned i = 0; i != 4; ++i) {
127      unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
128      unsigned ResElt;
129      if (SrcElt < 4)
130        ResElt = getMaskElt(LHSMask, SrcElt);
131      else if (SrcElt < 8)
132        ResElt = getMaskElt(RHSMask, SrcElt-4);
133      else {
134        assert(SrcElt == 8 && "Bad src elt!");
135        ResElt = 8;
136      }
137      Result |= ResElt << (4*i);
138    }
139    return Result;
140  }
141};
142
143static const char *getZeroCostOpName(unsigned short Op) {
144  if (ShufTab[Op].Arg0 == 0x0123)
145    return "LHS";
146  else if (ShufTab[Op].Arg0 == 0x4567)
147    return "RHS";
148  else {
149    assert(0 && "bad zero cost operation");
150    abort();
151  }
152}
153
154static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
155  unsigned short ThisOp = Vals[ValNo];
156  std::cerr << "t" << ValNo;
157  PrintMask(ThisOp, std::cerr);
158  std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
159
160  if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
161    std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
162    PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
163  } else {
164    // Figure out what tmp # it is.
165    for (unsigned i = 0; ; ++i)
166      if (Vals[i] == ShufTab[ThisOp].Arg0) {
167        std::cerr << "t" << i;
168        break;
169      }
170  }
171
172  if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
173    std::cerr << ", ";
174    if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
175      std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
176      PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
177    } else {
178      // Figure out what tmp # it is.
179      for (unsigned i = 0; ; ++i)
180        if (Vals[i] == ShufTab[ThisOp].Arg1) {
181          std::cerr << "t" << i;
182          break;
183        }
184    }
185  }
186  std::cerr << ")  ";
187}
188
189static unsigned getNumEntered() {
190  unsigned Count = 0;
191  for (unsigned i = 0; i != 65536; ++i)
192    Count += ShufTab[i].Cost < 100;
193  return Count;
194}
195
196static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
197                        unsigned &NumVals) {
198  if (ShufTab[Elt].Cost == 0) return;
199
200  // If this value has already been evaluated, it is free.  FIXME: match undefs.
201  for (unsigned i = 0, e = NumVals; i != e; ++i)
202    if (Vals[i] == Elt) return;
203
204  // Otherwise, get the operands of the value, then add it.
205  unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
206  if (ShufTab[Arg0].Cost)
207    EvaluateOps(Arg0, Vals, NumVals);
208  if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
209    EvaluateOps(Arg1, Vals, NumVals);
210
211  Vals[NumVals++] = Elt;
212}
213
214
215int main() {
216  // Seed the table with accesses to the LHS and RHS.
217  ShufTab[0x0123].Cost = 0;
218  ShufTab[0x0123].Op = 0;
219  ShufTab[0x0123].Arg0 = 0x0123;
220  ShufTab[0x4567].Cost = 0;
221  ShufTab[0x4567].Op = 0;
222  ShufTab[0x4567].Arg0 = 0x4567;
223
224  // Seed the first-level of shuffles, shuffles whose inputs are the input to
225  // the vectorshuffle operation.
226  bool MadeChange = true;
227  unsigned OpCount = 0;
228  while (MadeChange) {
229    MadeChange = false;
230    ++OpCount;
231    std::cerr << "Starting iteration #" << OpCount << " with "
232              << getNumEntered() << " entries established.\n";
233
234    // Scan the table for two reasons: First, compute the maximum cost of any
235    // operation left in the table.  Second, make sure that values with undefs
236    // have the cheapest alternative that they match.
237    unsigned MaxCost = ShufTab[0].Cost;
238    for (unsigned i = 1; i != 0x8889; ++i) {
239      if (!isValidMask(i)) continue;
240      if (ShufTab[i].Cost > MaxCost)
241        MaxCost = ShufTab[i].Cost;
242
243      // If this value has an undef, make it be computed the cheapest possible
244      // way of any of the things that it matches.
245      if (hasUndefElements(i)) {
246        // This code is a little bit tricky, so here's the idea: consider some
247        // permutation, like 7u4u.  To compute the lowest cost for 7u4u, we
248        // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries.  If
249        // there are 3 undefs, the number rises to 729 entries we have to scan,
250        // and for the 4 undef case, we have to scan the whole table.
251        //
252        // Instead of doing this huge amount of scanning, we process the table
253        // entries *in order*, and use the fact that 'u' is 8, larger than any
254        // valid index.  Given an entry like 7u4u then, we only need to scan
255        // 7[0-7]4u - 8 entries.  We can get away with this, because we already
256        // know that each of 704u, 714u, 724u, etc contain the minimum value of
257        // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
258        unsigned UndefIdx;
259        if (i & 0x8000)
260          UndefIdx = 0;
261        else if (i & 0x0800)
262          UndefIdx = 1;
263        else if (i & 0x0080)
264          UndefIdx = 2;
265        else if (i & 0x0008)
266          UndefIdx = 3;
267        else
268          abort();
269
270        unsigned MinVal  = i;
271        unsigned MinCost = ShufTab[i].Cost;
272
273        // Scan the 8 entries.
274        for (unsigned j = 0; j != 8; ++j) {
275          unsigned NewElt = setMaskElt(i, UndefIdx, j);
276          if (ShufTab[NewElt].Cost < MinCost) {
277            MinCost = ShufTab[NewElt].Cost;
278            MinVal = NewElt;
279          }
280        }
281
282        // If we found something cheaper than what was here before, use it.
283        if (i != MinVal) {
284          MadeChange = true;
285          ShufTab[i] = ShufTab[MinVal];
286        }
287      }
288    }
289
290    for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
291      if (!isValidMask(LHS)) continue;
292      if (ShufTab[LHS].Cost > 1000) continue;
293
294      // If nothing involving this operand could possibly be cheaper than what
295      // we already have, don't consider it.
296      if (ShufTab[LHS].Cost + 1 >= MaxCost)
297        continue;
298
299      for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
300        Operator *Op = TheOperators[opnum];
301
302        // Evaluate op(LHS,LHS)
303        unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
304
305        unsigned Cost = ShufTab[LHS].Cost + 1;
306        if (Cost < ShufTab[ResultMask].Cost) {
307          ShufTab[ResultMask].Cost = Cost;
308          ShufTab[ResultMask].Op = Op;
309          ShufTab[ResultMask].Arg0 = LHS;
310          ShufTab[ResultMask].Arg1 = LHS;
311          MadeChange = true;
312        }
313
314        // If this is a two input instruction, include the op(x,y) cases.  If
315        // this is a one input instruction, skip this.
316        if (Op->isOnlyLHSOperator()) continue;
317
318        for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
319          if (!isValidMask(RHS)) continue;
320          if (ShufTab[RHS].Cost > 1000) continue;
321
322          // If nothing involving this operand could possibly be cheaper than
323          // what we already have, don't consider it.
324          if (ShufTab[RHS].Cost + 1 >= MaxCost)
325            continue;
326
327
328          // Evaluate op(LHS,RHS)
329          unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
330
331          if (ShufTab[ResultMask].Cost <= OpCount ||
332              ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
333              ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
334            continue;
335
336          // Figure out the cost to evaluate this, knowing that CSE's only need
337          // to be evaluated once.
338          unsigned short Vals[30];
339          unsigned NumVals = 0;
340          EvaluateOps(LHS, Vals, NumVals);
341          EvaluateOps(RHS, Vals, NumVals);
342
343          unsigned Cost = NumVals + 1;
344          if (Cost < ShufTab[ResultMask].Cost) {
345            ShufTab[ResultMask].Cost = Cost;
346            ShufTab[ResultMask].Op = Op;
347            ShufTab[ResultMask].Arg0 = LHS;
348            ShufTab[ResultMask].Arg1 = RHS;
349            MadeChange = true;
350          }
351        }
352      }
353    }
354  }
355
356  std::cerr << "Finished Table has " << getNumEntered()
357            << " entries established.\n";
358
359  unsigned CostArray[10] = { 0 };
360
361  // Compute a cost histogram.
362  for (unsigned i = 0; i != 65536; ++i) {
363    if (!isValidMask(i)) continue;
364    if (ShufTab[i].Cost > 9)
365      ++CostArray[9];
366    else
367      ++CostArray[ShufTab[i].Cost];
368  }
369
370  for (unsigned i = 0; i != 9; ++i)
371    if (CostArray[i])
372      std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
373  if (CostArray[9])
374    std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
375
376
377  // Build up the table to emit.
378  std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
379  std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
380
381  for (unsigned i = 0; i != 0x8889; ++i) {
382    if (!isValidMask(i)) continue;
383
384    // CostSat - The cost of this operation saturated to two bits.
385    unsigned CostSat = ShufTab[i].Cost;
386    if (CostSat > 4) CostSat = 4;
387    if (CostSat == 0) CostSat = 1;
388    --CostSat;  // Cost is now between 0-3.
389
390    unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
391    assert(OpNum < 16 && "Too few bits to encode operation!");
392
393    unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
394    unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
395
396    // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
397    // LHS, and 13 bits of RHS = 32 bits.
398    unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS;
399
400    std::cout << "  " << Val << "U,\t// ";
401    PrintMask(i, std::cout);
402    std::cout << ": Cost " << ShufTab[i].Cost;
403    std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
404    std::cout << " ";
405    if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
406      std::cout << getZeroCostOpName(ShufTab[i].Arg0);
407    } else {
408      PrintMask(ShufTab[i].Arg0, std::cout);
409    }
410
411    if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
412      std::cout << ", ";
413      if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
414        std::cout << getZeroCostOpName(ShufTab[i].Arg1);
415      } else {
416        PrintMask(ShufTab[i].Arg1, std::cout);
417      }
418    }
419    std::cout << "\n";
420  }
421  std::cout << "  0\n};\n";
422
423  if (0) {
424    // Print out the table.
425    for (unsigned i = 0; i != 0x8889; ++i) {
426      if (!isValidMask(i)) continue;
427      if (ShufTab[i].Cost < 1000) {
428        PrintMask(i, std::cerr);
429        std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
430
431        unsigned short Vals[30];
432        unsigned NumVals = 0;
433        EvaluateOps(i, Vals, NumVals);
434
435        for (unsigned j = 0, e = NumVals; j != e; ++j)
436          PrintOperation(j, Vals);
437        std::cerr << "\n";
438      }
439    }
440  }
441}
442
443
444#define GENERATE_ALTIVEC
445
446#ifdef GENERATE_ALTIVEC
447
448///===---------------------------------------------------------------------===//
449/// The altivec instruction definitions.  This is the altivec-specific part of
450/// this file.
451///===---------------------------------------------------------------------===//
452
453// Note that the opcode numbers here must match those in the PPC backend.
454enum {
455  OP_COPY = 0,   // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
456  OP_VMRGHW,
457  OP_VMRGLW,
458  OP_VSPLTISW0,
459  OP_VSPLTISW1,
460  OP_VSPLTISW2,
461  OP_VSPLTISW3,
462  OP_VSLDOI4,
463  OP_VSLDOI8,
464  OP_VSLDOI12
465};
466
467struct vmrghw : public Operator {
468  vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {}
469} the_vmrghw;
470
471struct vmrglw : public Operator {
472  vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {}
473} the_vmrglw;
474
475template<unsigned Elt>
476struct vspltisw : public Operator {
477  vspltisw(const char *N, unsigned Opc)
478    : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {}
479};
480
481vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0);
482vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1);
483vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2);
484vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3);
485
486template<unsigned N>
487struct vsldoi : public Operator {
488  vsldoi(const char *Name, unsigned Opc)
489    : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) {
490  }
491};
492
493vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4);
494vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8);
495vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12);
496
497#endif
498