PerfectShuffle.cpp revision 4ad53bdd19539e9781ed1c7644c7a3ea061028b9
1//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file was developed by Chris Lattner and is distributed under 6// the University of Illinois Open Source License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file computes an optimal sequence of instructions for doing all shuffles 11// of two 4-element vectors. With a release build and when configured to emit 12// an altivec instruction table, this takes about 30s to run on a 2.7Ghz 13// PowerPC G5. 14// 15//===----------------------------------------------------------------------===// 16 17#include <iostream> 18#include <vector> 19 20struct Operator; 21 22// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes 23// an element from that value of the input vectors. A value of 8 means the 24// entry is undefined. 25 26// Mask manipulation functions. 27static inline unsigned short MakeMask(unsigned V0, unsigned V1, 28 unsigned V2, unsigned V3) { 29 return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4)); 30} 31 32/// getMaskElt - Return element N of the specified mask. 33static unsigned getMaskElt(unsigned Mask, unsigned Elt) { 34 return (Mask >> ((3-Elt)*4)) & 0xF; 35} 36 37static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) { 38 unsigned FieldShift = ((3-Elt)*4); 39 return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift); 40} 41 42// Reject elements where the values are 9-15. 43static bool isValidMask(unsigned short Mask) { 44 unsigned short UndefBits = Mask & 0x8888; 45 return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0; 46} 47 48/// hasUndefElements - Return true if any of the elements in the mask are undefs 49/// 50static bool hasUndefElements(unsigned short Mask) { 51 return (Mask & 0x8888) != 0; 52} 53 54/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not 55/// including undef values.. 56static bool isOnlyLHSMask(unsigned short Mask) { 57 return (Mask & 0x4444) == 0; 58} 59 60/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to 61/// refer to the LHS only (for when one argument value is passed into the same 62/// function twice). 63static unsigned short getLHSOnlyMask(unsigned short Mask) { 64 return Mask & 0xBBBB; // Keep only LHS and Undefs. 65} 66 67/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4 68/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9. 69static unsigned getCompressedMask(unsigned short Mask) { 70 return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 + 71 getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3); 72} 73 74static void PrintMask(unsigned i, std::ostream &OS) { 75 OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0))) 76 << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1))) 77 << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2))) 78 << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3))) 79 << ">"; 80} 81 82/// ShuffleVal - This represents a shufflevector operation. 83struct ShuffleVal { 84 unsigned Cost; // Number of instrs used to generate this value. 85 Operator *Op; // The Operation used to generate this value. 86 unsigned short Arg0, Arg1; // Input operands for this value. 87 88 ShuffleVal() : Cost(1000000) {} 89}; 90 91 92/// ShufTab - This is the actual shuffle table that we are trying to generate. 93/// 94static ShuffleVal ShufTab[65536]; 95 96/// TheOperators - All of the operators that this target supports. 97static std::vector<Operator*> TheOperators; 98 99/// Operator - This is a vector operation that is available for use. 100struct Operator { 101 unsigned short ShuffleMask; 102 unsigned short OpNum; 103 const char *Name; 104 105 Operator(unsigned short shufflemask, const char *name) 106 : ShuffleMask(shufflemask), Name(name) { 107 OpNum = TheOperators.size(); 108 TheOperators.push_back(this); 109 } 110 ~Operator() { 111 assert(TheOperators.back() == this); 112 TheOperators.pop_back(); 113 } 114 115 bool isOnlyLHSOperator() const { 116 return isOnlyLHSMask(ShuffleMask); 117 } 118 119 const char *getName() const { return Name; } 120 121 unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) { 122 // Extract the elements from LHSMask and RHSMask, as appropriate. 123 unsigned Result = 0; 124 for (unsigned i = 0; i != 4; ++i) { 125 unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF; 126 unsigned ResElt; 127 if (SrcElt < 4) 128 ResElt = getMaskElt(LHSMask, SrcElt); 129 else if (SrcElt < 8) 130 ResElt = getMaskElt(RHSMask, SrcElt-4); 131 else { 132 assert(SrcElt == 8 && "Bad src elt!"); 133 ResElt = 8; 134 } 135 Result |= ResElt << (4*i); 136 } 137 return Result; 138 } 139}; 140 141static const char *getZeroCostOpName(unsigned short Op) { 142 if (ShufTab[Op].Arg0 == 0x0123) 143 return "LHS"; 144 else if (ShufTab[Op].Arg0 == 0x4567) 145 return "RHS"; 146 else { 147 assert(0 && "bad zero cost operation"); 148 abort(); 149 } 150} 151 152static void PrintOperation(unsigned ValNo, unsigned short Vals[]) { 153 unsigned short ThisOp = Vals[ValNo]; 154 std::cerr << "t" << ValNo; 155 PrintMask(ThisOp, std::cerr); 156 std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "("; 157 158 if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) { 159 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0); 160 PrintMask(ShufTab[ThisOp].Arg0, std::cerr); 161 } else { 162 // Figure out what tmp # it is. 163 for (unsigned i = 0; ; ++i) 164 if (Vals[i] == ShufTab[ThisOp].Arg0) { 165 std::cerr << "t" << i; 166 break; 167 } 168 } 169 170 if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) { 171 std::cerr << ", "; 172 if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) { 173 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1); 174 PrintMask(ShufTab[ThisOp].Arg1, std::cerr); 175 } else { 176 // Figure out what tmp # it is. 177 for (unsigned i = 0; ; ++i) 178 if (Vals[i] == ShufTab[ThisOp].Arg1) { 179 std::cerr << "t" << i; 180 break; 181 } 182 } 183 } 184 std::cerr << ") "; 185} 186 187static unsigned getNumEntered() { 188 unsigned Count = 0; 189 for (unsigned i = 0; i != 65536; ++i) 190 Count += ShufTab[i].Cost < 100; 191 return Count; 192} 193 194static void EvaluateOps(unsigned short Elt, unsigned short Vals[], 195 unsigned &NumVals) { 196 if (ShufTab[Elt].Cost == 0) return; 197 198 // If this value has already been evaluated, it is free. FIXME: match undefs. 199 for (unsigned i = 0, e = NumVals; i != e; ++i) 200 if (Vals[i] == Elt) return; 201 202 // Otherwise, get the operands of the value, then add it. 203 unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1; 204 if (ShufTab[Arg0].Cost) 205 EvaluateOps(Arg0, Vals, NumVals); 206 if (Arg0 != Arg1 && ShufTab[Arg1].Cost) 207 EvaluateOps(Arg1, Vals, NumVals); 208 209 Vals[NumVals++] = Elt; 210} 211 212 213int main() { 214 // Seed the table with accesses to the LHS and RHS. 215 ShufTab[0x0123].Cost = 0; 216 ShufTab[0x0123].Op = 0; 217 ShufTab[0x0123].Arg0 = 0x0123; 218 ShufTab[0x4567].Cost = 0; 219 ShufTab[0x4567].Op = 0; 220 ShufTab[0x4567].Arg0 = 0x4567; 221 222 // Seed the first-level of shuffles, shuffles whose inputs are the input to 223 // the vectorshuffle operation. 224 bool MadeChange = true; 225 unsigned OpCount = 0; 226 while (MadeChange) { 227 MadeChange = false; 228 ++OpCount; 229 std::cerr << "Starting iteration #" << OpCount << " with " 230 << getNumEntered() << " entries established.\n"; 231 232 // Scan the table for two reasons: First, compute the maximum cost of any 233 // operation left in the table. Second, make sure that values with undefs 234 // have the cheapest alternative that they match. 235 unsigned MaxCost = ShufTab[0].Cost; 236 for (unsigned i = 1; i != 0x8889; ++i) { 237 if (!isValidMask(i)) continue; 238 if (ShufTab[i].Cost > MaxCost) 239 MaxCost = ShufTab[i].Cost; 240 241 // If this value has an undef, make it be computed the cheapest possible 242 // way of any of the things that it matches. 243 if (hasUndefElements(i)) { 244 // This code is a little bit tricky, so here's the idea: consider some 245 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we 246 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If 247 // there are 3 undefs, the number rises to 729 entries we have to scan, 248 // and for the 4 undef case, we have to scan the whole table. 249 // 250 // Instead of doing this huge amount of scanning, we process the table 251 // entries *in order*, and use the fact that 'u' is 8, larger than any 252 // valid index. Given an entry like 7u4u then, we only need to scan 253 // 7[0-7]4u - 8 entries. We can get away with this, because we already 254 // know that each of 704u, 714u, 724u, etc contain the minimum value of 255 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively. 256 unsigned UndefIdx; 257 if (i & 0x8000) 258 UndefIdx = 0; 259 else if (i & 0x0800) 260 UndefIdx = 1; 261 else if (i & 0x0080) 262 UndefIdx = 2; 263 else if (i & 0x0008) 264 UndefIdx = 3; 265 else 266 abort(); 267 268 unsigned MinVal = i; 269 unsigned MinCost = ShufTab[i].Cost; 270 271 // Scan the 8 entries. 272 for (unsigned j = 0; j != 8; ++j) { 273 unsigned NewElt = setMaskElt(i, UndefIdx, j); 274 if (ShufTab[NewElt].Cost < MinCost) { 275 MinCost = ShufTab[NewElt].Cost; 276 MinVal = NewElt; 277 } 278 } 279 280 // If we found something cheaper than what was here before, use it. 281 if (i != MinVal) { 282 MadeChange = true; 283 ShufTab[i] = ShufTab[MinVal]; 284 } 285 } 286 } 287 288 for (unsigned LHS = 0; LHS != 0x8889; ++LHS) { 289 if (!isValidMask(LHS)) continue; 290 if (ShufTab[LHS].Cost > 1000) continue; 291 292 // If nothing involving this operand could possibly be cheaper than what 293 // we already have, don't consider it. 294 if (ShufTab[LHS].Cost + 1 >= MaxCost) 295 continue; 296 297 for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) { 298 Operator *Op = TheOperators[opnum]; 299 unsigned short Mask = Op->ShuffleMask; 300 301 // Evaluate op(LHS,LHS) 302 unsigned ResultMask = Op->getTransformedMask(LHS, LHS); 303 304 unsigned Cost = ShufTab[LHS].Cost + 1; 305 if (Cost < ShufTab[ResultMask].Cost) { 306 ShufTab[ResultMask].Cost = Cost; 307 ShufTab[ResultMask].Op = Op; 308 ShufTab[ResultMask].Arg0 = LHS; 309 ShufTab[ResultMask].Arg1 = LHS; 310 MadeChange = true; 311 } 312 313 // If this is a two input instruction, include the op(x,y) cases. If 314 // this is a one input instruction, skip this. 315 if (Op->isOnlyLHSOperator()) continue; 316 317 for (unsigned RHS = 0; RHS != 0x8889; ++RHS) { 318 if (!isValidMask(RHS)) continue; 319 if (ShufTab[RHS].Cost > 1000) continue; 320 321 // If nothing involving this operand could possibly be cheaper than 322 // what we already have, don't consider it. 323 if (ShufTab[RHS].Cost + 1 >= MaxCost) 324 continue; 325 326 327 // Evaluate op(LHS,RHS) 328 unsigned ResultMask = Op->getTransformedMask(LHS, RHS); 329 330 if (ShufTab[ResultMask].Cost <= OpCount || 331 ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost || 332 ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost) 333 continue; 334 335 // Figure out the cost to evaluate this, knowing that CSE's only need 336 // to be evaluated once. 337 unsigned short Vals[30]; 338 unsigned NumVals = 0; 339 EvaluateOps(LHS, Vals, NumVals); 340 EvaluateOps(RHS, Vals, NumVals); 341 342 unsigned Cost = NumVals + 1; 343 if (Cost < ShufTab[ResultMask].Cost) { 344 ShufTab[ResultMask].Cost = Cost; 345 ShufTab[ResultMask].Op = Op; 346 ShufTab[ResultMask].Arg0 = LHS; 347 ShufTab[ResultMask].Arg1 = RHS; 348 MadeChange = true; 349 } 350 } 351 } 352 } 353 } 354 355 std::cerr << "Finished Table has " << getNumEntered() 356 << " entries established.\n"; 357 358 unsigned CostArray[10] = { 0 }; 359 360 // Compute a cost histogram. 361 for (unsigned i = 0; i != 65536; ++i) { 362 if (!isValidMask(i)) continue; 363 if (ShufTab[i].Cost > 9) 364 ++CostArray[9]; 365 else 366 ++CostArray[ShufTab[i].Cost]; 367 } 368 369 for (unsigned i = 0; i != 9; ++i) 370 if (CostArray[i]) 371 std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n"; 372 if (CostArray[9]) 373 std::cout << "// " << CostArray[9] << " entries have higher cost!\n"; 374 375 376 // Build up the table to emit. 377 std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n"; 378 std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n"; 379 380 for (unsigned i = 0; i != 0x8889; ++i) { 381 if (!isValidMask(i)) continue; 382 383 // CostSat - The cost of this operation saturated to two bits. 384 unsigned CostSat = ShufTab[i].Cost; 385 if (CostSat > 3) CostSat = 3; 386 387 unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0; 388 assert(OpNum < 16 && "Too few bits to encode operation!"); 389 390 unsigned LHS = getCompressedMask(ShufTab[i].Arg0); 391 unsigned RHS = getCompressedMask(ShufTab[i].Arg1); 392 393 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of 394 // LHS, and 13 bits of RHS = 32 bits. 395 unsigned Val = (CostSat << 30) | (OpNum << 27) | (LHS << 13) | RHS; 396 397 std::cout << " " << Val << "U,\t// "; 398 PrintMask(i, std::cout); 399 std::cout << ": Cost " << ShufTab[i].Cost; 400 std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy"); 401 std::cout << " "; 402 if (ShufTab[ShufTab[i].Arg0].Cost == 0) { 403 std::cout << getZeroCostOpName(ShufTab[i].Arg0); 404 } else { 405 PrintMask(ShufTab[i].Arg0, std::cout); 406 } 407 408 if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) { 409 std::cout << ", "; 410 if (ShufTab[ShufTab[i].Arg1].Cost == 0) { 411 std::cout << getZeroCostOpName(ShufTab[i].Arg1); 412 } else { 413 PrintMask(ShufTab[i].Arg1, std::cout); 414 } 415 } 416 std::cout << "\n"; 417 } 418 std::cout << " 0\n};\n"; 419 420 if (0) { 421 // Print out the table. 422 for (unsigned i = 0; i != 0x8889; ++i) { 423 if (!isValidMask(i)) continue; 424 if (ShufTab[i].Cost < 1000) { 425 PrintMask(i, std::cerr); 426 std::cerr << " - Cost " << ShufTab[i].Cost << " - "; 427 428 unsigned short Vals[30]; 429 unsigned NumVals = 0; 430 EvaluateOps(i, Vals, NumVals); 431 432 for (unsigned j = 0, e = NumVals; j != e; ++j) 433 PrintOperation(j, Vals); 434 std::cerr << "\n"; 435 } 436 } 437 } 438} 439 440 441 442///===---------------------------------------------------------------------===// 443/// The altivec instruction definitions. This is the altivec-specific part of 444/// this file. 445///===---------------------------------------------------------------------===// 446 447struct vmrghw : public Operator { 448 vmrghw() : Operator(0x0415, "vmrghw") {} 449} the_vmrghw; 450 451struct vmrglw : public Operator { 452 vmrglw() : Operator(0x2637, "vmrglw") {} 453} the_vmrglw; 454 455template<unsigned Elt> 456struct vspltisw : public Operator { 457 vspltisw(const char *N) : Operator(MakeMask(Elt, Elt, Elt, Elt), N) {} 458}; 459 460vspltisw<0> the_vspltisw0("vspltisw0"); 461vspltisw<1> the_vspltisw1("vspltisw1"); 462vspltisw<2> the_vspltisw2("vspltisw2"); 463vspltisw<3> the_vspltisw3("vspltisw3"); 464 465template<unsigned N> 466struct vsldoi : public Operator { 467 vsldoi(const char *n) : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), n){ 468 } 469}; 470 471vsldoi<1> the_vsldoi1("vsldoi4"); 472vsldoi<2> the_vsldoi2("vsldoi8"); 473vsldoi<3> the_vsldoi3("vsldoi12"); 474 475