PerfectShuffle.cpp revision 4ad53bdd19539e9781ed1c7644c7a3ea061028b9
1//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file was developed by Chris Lattner and is distributed under
6// the University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file computes an optimal sequence of instructions for doing all shuffles
11// of two 4-element vectors.  With a release build and when configured to emit
12// an altivec instruction table, this takes about 30s to run on a 2.7Ghz
13// PowerPC G5.
14//
15//===----------------------------------------------------------------------===//
16
17#include <iostream>
18#include <vector>
19
20struct Operator;
21
22// Masks are 4-nibble hex numbers.  Values 0-7 in any nibble means that it takes
23// an element from that value of the input vectors.  A value of 8 means the
24// entry is undefined.
25
26// Mask manipulation functions.
27static inline unsigned short MakeMask(unsigned V0, unsigned V1,
28                                      unsigned V2, unsigned V3) {
29  return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4));
30}
31
32/// getMaskElt - Return element N of the specified mask.
33static unsigned getMaskElt(unsigned Mask, unsigned Elt) {
34  return (Mask >> ((3-Elt)*4)) & 0xF;
35}
36
37static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) {
38  unsigned FieldShift = ((3-Elt)*4);
39  return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift);
40}
41
42// Reject elements where the values are 9-15.
43static bool isValidMask(unsigned short Mask) {
44  unsigned short UndefBits = Mask & 0x8888;
45  return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0;
46}
47
48/// hasUndefElements - Return true if any of the elements in the mask are undefs
49///
50static bool hasUndefElements(unsigned short Mask) {
51  return (Mask & 0x8888) != 0;
52}
53
54/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not
55/// including undef values..
56static bool isOnlyLHSMask(unsigned short Mask) {
57  return (Mask & 0x4444) == 0;
58}
59
60/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to
61/// refer to the LHS only (for when one argument value is passed into the same
62/// function twice).
63static unsigned short getLHSOnlyMask(unsigned short Mask) {
64  return Mask & 0xBBBB;  // Keep only LHS and Undefs.
65}
66
67/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4
68/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9.
69static unsigned getCompressedMask(unsigned short Mask) {
70  return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 +
71         getMaskElt(Mask, 2)*9     + getMaskElt(Mask, 3);
72}
73
74static void PrintMask(unsigned i, std::ostream &OS) {
75  OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0)))
76     << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1)))
77     << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2)))
78     << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3)))
79     << ">";
80}
81
82/// ShuffleVal - This represents a shufflevector operation.
83struct ShuffleVal {
84  unsigned Cost;  // Number of instrs used to generate this value.
85  Operator *Op;   // The Operation used to generate this value.
86  unsigned short Arg0, Arg1;  // Input operands for this value.
87
88  ShuffleVal() : Cost(1000000) {}
89};
90
91
92/// ShufTab - This is the actual shuffle table that we are trying to generate.
93///
94static ShuffleVal ShufTab[65536];
95
96/// TheOperators - All of the operators that this target supports.
97static std::vector<Operator*> TheOperators;
98
99/// Operator - This is a vector operation that is available for use.
100struct Operator {
101  unsigned short ShuffleMask;
102  unsigned short OpNum;
103  const char *Name;
104
105  Operator(unsigned short shufflemask, const char *name)
106    : ShuffleMask(shufflemask), Name(name) {
107    OpNum = TheOperators.size();
108    TheOperators.push_back(this);
109  }
110  ~Operator() {
111    assert(TheOperators.back() == this);
112    TheOperators.pop_back();
113  }
114
115  bool isOnlyLHSOperator() const {
116    return isOnlyLHSMask(ShuffleMask);
117  }
118
119  const char *getName() const { return Name; }
120
121  unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) {
122    // Extract the elements from LHSMask and RHSMask, as appropriate.
123    unsigned Result = 0;
124    for (unsigned i = 0; i != 4; ++i) {
125      unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF;
126      unsigned ResElt;
127      if (SrcElt < 4)
128        ResElt = getMaskElt(LHSMask, SrcElt);
129      else if (SrcElt < 8)
130        ResElt = getMaskElt(RHSMask, SrcElt-4);
131      else {
132        assert(SrcElt == 8 && "Bad src elt!");
133        ResElt = 8;
134      }
135      Result |= ResElt << (4*i);
136    }
137    return Result;
138  }
139};
140
141static const char *getZeroCostOpName(unsigned short Op) {
142  if (ShufTab[Op].Arg0 == 0x0123)
143    return "LHS";
144  else if (ShufTab[Op].Arg0 == 0x4567)
145    return "RHS";
146  else {
147    assert(0 && "bad zero cost operation");
148    abort();
149  }
150}
151
152static void PrintOperation(unsigned ValNo, unsigned short Vals[]) {
153  unsigned short ThisOp = Vals[ValNo];
154  std::cerr << "t" << ValNo;
155  PrintMask(ThisOp, std::cerr);
156  std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "(";
157
158  if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) {
159    std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0);
160    PrintMask(ShufTab[ThisOp].Arg0, std::cerr);
161  } else {
162    // Figure out what tmp # it is.
163    for (unsigned i = 0; ; ++i)
164      if (Vals[i] == ShufTab[ThisOp].Arg0) {
165        std::cerr << "t" << i;
166        break;
167      }
168  }
169
170  if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) {
171    std::cerr << ", ";
172    if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) {
173      std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1);
174      PrintMask(ShufTab[ThisOp].Arg1, std::cerr);
175    } else {
176      // Figure out what tmp # it is.
177      for (unsigned i = 0; ; ++i)
178        if (Vals[i] == ShufTab[ThisOp].Arg1) {
179          std::cerr << "t" << i;
180          break;
181        }
182    }
183  }
184  std::cerr << ")  ";
185}
186
187static unsigned getNumEntered() {
188  unsigned Count = 0;
189  for (unsigned i = 0; i != 65536; ++i)
190    Count += ShufTab[i].Cost < 100;
191  return Count;
192}
193
194static void EvaluateOps(unsigned short Elt, unsigned short Vals[],
195                        unsigned &NumVals) {
196  if (ShufTab[Elt].Cost == 0) return;
197
198  // If this value has already been evaluated, it is free.  FIXME: match undefs.
199  for (unsigned i = 0, e = NumVals; i != e; ++i)
200    if (Vals[i] == Elt) return;
201
202  // Otherwise, get the operands of the value, then add it.
203  unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1;
204  if (ShufTab[Arg0].Cost)
205    EvaluateOps(Arg0, Vals, NumVals);
206  if (Arg0 != Arg1 && ShufTab[Arg1].Cost)
207    EvaluateOps(Arg1, Vals, NumVals);
208
209  Vals[NumVals++] = Elt;
210}
211
212
213int main() {
214  // Seed the table with accesses to the LHS and RHS.
215  ShufTab[0x0123].Cost = 0;
216  ShufTab[0x0123].Op = 0;
217  ShufTab[0x0123].Arg0 = 0x0123;
218  ShufTab[0x4567].Cost = 0;
219  ShufTab[0x4567].Op = 0;
220  ShufTab[0x4567].Arg0 = 0x4567;
221
222  // Seed the first-level of shuffles, shuffles whose inputs are the input to
223  // the vectorshuffle operation.
224  bool MadeChange = true;
225  unsigned OpCount = 0;
226  while (MadeChange) {
227    MadeChange = false;
228    ++OpCount;
229    std::cerr << "Starting iteration #" << OpCount << " with "
230              << getNumEntered() << " entries established.\n";
231
232    // Scan the table for two reasons: First, compute the maximum cost of any
233    // operation left in the table.  Second, make sure that values with undefs
234    // have the cheapest alternative that they match.
235    unsigned MaxCost = ShufTab[0].Cost;
236    for (unsigned i = 1; i != 0x8889; ++i) {
237      if (!isValidMask(i)) continue;
238      if (ShufTab[i].Cost > MaxCost)
239        MaxCost = ShufTab[i].Cost;
240
241      // If this value has an undef, make it be computed the cheapest possible
242      // way of any of the things that it matches.
243      if (hasUndefElements(i)) {
244        // This code is a little bit tricky, so here's the idea: consider some
245        // permutation, like 7u4u.  To compute the lowest cost for 7u4u, we
246        // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries.  If
247        // there are 3 undefs, the number rises to 729 entries we have to scan,
248        // and for the 4 undef case, we have to scan the whole table.
249        //
250        // Instead of doing this huge amount of scanning, we process the table
251        // entries *in order*, and use the fact that 'u' is 8, larger than any
252        // valid index.  Given an entry like 7u4u then, we only need to scan
253        // 7[0-7]4u - 8 entries.  We can get away with this, because we already
254        // know that each of 704u, 714u, 724u, etc contain the minimum value of
255        // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively.
256        unsigned UndefIdx;
257        if (i & 0x8000)
258          UndefIdx = 0;
259        else if (i & 0x0800)
260          UndefIdx = 1;
261        else if (i & 0x0080)
262          UndefIdx = 2;
263        else if (i & 0x0008)
264          UndefIdx = 3;
265        else
266          abort();
267
268        unsigned MinVal  = i;
269        unsigned MinCost = ShufTab[i].Cost;
270
271        // Scan the 8 entries.
272        for (unsigned j = 0; j != 8; ++j) {
273          unsigned NewElt = setMaskElt(i, UndefIdx, j);
274          if (ShufTab[NewElt].Cost < MinCost) {
275            MinCost = ShufTab[NewElt].Cost;
276            MinVal = NewElt;
277          }
278        }
279
280        // If we found something cheaper than what was here before, use it.
281        if (i != MinVal) {
282          MadeChange = true;
283          ShufTab[i] = ShufTab[MinVal];
284        }
285      }
286    }
287
288    for (unsigned LHS = 0; LHS != 0x8889; ++LHS) {
289      if (!isValidMask(LHS)) continue;
290      if (ShufTab[LHS].Cost > 1000) continue;
291
292      // If nothing involving this operand could possibly be cheaper than what
293      // we already have, don't consider it.
294      if (ShufTab[LHS].Cost + 1 >= MaxCost)
295        continue;
296
297      for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) {
298        Operator *Op = TheOperators[opnum];
299        unsigned short Mask = Op->ShuffleMask;
300
301        // Evaluate op(LHS,LHS)
302        unsigned ResultMask = Op->getTransformedMask(LHS, LHS);
303
304        unsigned Cost = ShufTab[LHS].Cost + 1;
305        if (Cost < ShufTab[ResultMask].Cost) {
306          ShufTab[ResultMask].Cost = Cost;
307          ShufTab[ResultMask].Op = Op;
308          ShufTab[ResultMask].Arg0 = LHS;
309          ShufTab[ResultMask].Arg1 = LHS;
310          MadeChange = true;
311        }
312
313        // If this is a two input instruction, include the op(x,y) cases.  If
314        // this is a one input instruction, skip this.
315        if (Op->isOnlyLHSOperator()) continue;
316
317        for (unsigned RHS = 0; RHS != 0x8889; ++RHS) {
318          if (!isValidMask(RHS)) continue;
319          if (ShufTab[RHS].Cost > 1000) continue;
320
321          // If nothing involving this operand could possibly be cheaper than
322          // what we already have, don't consider it.
323          if (ShufTab[RHS].Cost + 1 >= MaxCost)
324            continue;
325
326
327          // Evaluate op(LHS,RHS)
328          unsigned ResultMask = Op->getTransformedMask(LHS, RHS);
329
330          if (ShufTab[ResultMask].Cost <= OpCount ||
331              ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost ||
332              ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost)
333            continue;
334
335          // Figure out the cost to evaluate this, knowing that CSE's only need
336          // to be evaluated once.
337          unsigned short Vals[30];
338          unsigned NumVals = 0;
339          EvaluateOps(LHS, Vals, NumVals);
340          EvaluateOps(RHS, Vals, NumVals);
341
342          unsigned Cost = NumVals + 1;
343          if (Cost < ShufTab[ResultMask].Cost) {
344            ShufTab[ResultMask].Cost = Cost;
345            ShufTab[ResultMask].Op = Op;
346            ShufTab[ResultMask].Arg0 = LHS;
347            ShufTab[ResultMask].Arg1 = RHS;
348            MadeChange = true;
349          }
350        }
351      }
352    }
353  }
354
355  std::cerr << "Finished Table has " << getNumEntered()
356            << " entries established.\n";
357
358  unsigned CostArray[10] = { 0 };
359
360  // Compute a cost histogram.
361  for (unsigned i = 0; i != 65536; ++i) {
362    if (!isValidMask(i)) continue;
363    if (ShufTab[i].Cost > 9)
364      ++CostArray[9];
365    else
366      ++CostArray[ShufTab[i].Cost];
367  }
368
369  for (unsigned i = 0; i != 9; ++i)
370    if (CostArray[i])
371      std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n";
372  if (CostArray[9])
373    std::cout << "// " << CostArray[9] << " entries have higher cost!\n";
374
375
376  // Build up the table to emit.
377  std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n";
378  std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n";
379
380  for (unsigned i = 0; i != 0x8889; ++i) {
381    if (!isValidMask(i)) continue;
382
383    // CostSat - The cost of this operation saturated to two bits.
384    unsigned CostSat = ShufTab[i].Cost;
385    if (CostSat > 3) CostSat = 3;
386
387    unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0;
388    assert(OpNum < 16 && "Too few bits to encode operation!");
389
390    unsigned LHS = getCompressedMask(ShufTab[i].Arg0);
391    unsigned RHS = getCompressedMask(ShufTab[i].Arg1);
392
393    // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of
394    // LHS, and 13 bits of RHS = 32 bits.
395    unsigned Val = (CostSat << 30) | (OpNum << 27) | (LHS << 13) | RHS;
396
397    std::cout << "  " << Val << "U,\t// ";
398    PrintMask(i, std::cout);
399    std::cout << ": Cost " << ShufTab[i].Cost;
400    std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy");
401    std::cout << " ";
402    if (ShufTab[ShufTab[i].Arg0].Cost == 0) {
403      std::cout << getZeroCostOpName(ShufTab[i].Arg0);
404    } else {
405      PrintMask(ShufTab[i].Arg0, std::cout);
406    }
407
408    if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) {
409      std::cout << ", ";
410      if (ShufTab[ShufTab[i].Arg1].Cost == 0) {
411        std::cout << getZeroCostOpName(ShufTab[i].Arg1);
412      } else {
413        PrintMask(ShufTab[i].Arg1, std::cout);
414      }
415    }
416    std::cout << "\n";
417  }
418  std::cout << "  0\n};\n";
419
420  if (0) {
421    // Print out the table.
422    for (unsigned i = 0; i != 0x8889; ++i) {
423      if (!isValidMask(i)) continue;
424      if (ShufTab[i].Cost < 1000) {
425        PrintMask(i, std::cerr);
426        std::cerr << " - Cost " << ShufTab[i].Cost << " - ";
427
428        unsigned short Vals[30];
429        unsigned NumVals = 0;
430        EvaluateOps(i, Vals, NumVals);
431
432        for (unsigned j = 0, e = NumVals; j != e; ++j)
433          PrintOperation(j, Vals);
434        std::cerr << "\n";
435      }
436    }
437  }
438}
439
440
441
442///===---------------------------------------------------------------------===//
443/// The altivec instruction definitions.  This is the altivec-specific part of
444/// this file.
445///===---------------------------------------------------------------------===//
446
447struct vmrghw : public Operator {
448  vmrghw() : Operator(0x0415, "vmrghw") {}
449} the_vmrghw;
450
451struct vmrglw : public Operator {
452  vmrglw() : Operator(0x2637, "vmrglw") {}
453} the_vmrglw;
454
455template<unsigned Elt>
456struct vspltisw : public Operator {
457  vspltisw(const char *N) : Operator(MakeMask(Elt, Elt, Elt, Elt), N) {}
458};
459
460vspltisw<0> the_vspltisw0("vspltisw0");
461vspltisw<1> the_vspltisw1("vspltisw1");
462vspltisw<2> the_vspltisw2("vspltisw2");
463vspltisw<3> the_vspltisw3("vspltisw3");
464
465template<unsigned N>
466struct vsldoi : public Operator {
467  vsldoi(const char *n) : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), n){
468  }
469};
470
471vsldoi<1> the_vsldoi1("vsldoi4");
472vsldoi<2> the_vsldoi2("vsldoi8");
473vsldoi<3> the_vsldoi3("vsldoi12");
474
475