1//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file computes an optimal sequence of instructions for doing all shuffles 11// of two 4-element vectors. With a release build and when configured to emit 12// an altivec instruction table, this takes about 30s to run on a 2.7Ghz 13// PowerPC G5. 14// 15//===----------------------------------------------------------------------===// 16 17#include <cassert> 18#include <cstdlib> 19#include <iomanip> 20#include <iostream> 21#include <vector> 22struct Operator; 23 24// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes 25// an element from that value of the input vectors. A value of 8 means the 26// entry is undefined. 27 28// Mask manipulation functions. 29static inline unsigned short MakeMask(unsigned V0, unsigned V1, 30 unsigned V2, unsigned V3) { 31 return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4)); 32} 33 34/// getMaskElt - Return element N of the specified mask. 35static unsigned getMaskElt(unsigned Mask, unsigned Elt) { 36 return (Mask >> ((3-Elt)*4)) & 0xF; 37} 38 39static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) { 40 unsigned FieldShift = ((3-Elt)*4); 41 return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift); 42} 43 44// Reject elements where the values are 9-15. 45static bool isValidMask(unsigned short Mask) { 46 unsigned short UndefBits = Mask & 0x8888; 47 return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0; 48} 49 50/// hasUndefElements - Return true if any of the elements in the mask are undefs 51/// 52static bool hasUndefElements(unsigned short Mask) { 53 return (Mask & 0x8888) != 0; 54} 55 56/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not 57/// including undef values.. 58static bool isOnlyLHSMask(unsigned short Mask) { 59 return (Mask & 0x4444) == 0; 60} 61 62/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to 63/// refer to the LHS only (for when one argument value is passed into the same 64/// function twice). 65#if 0 66static unsigned short getLHSOnlyMask(unsigned short Mask) { 67 return Mask & 0xBBBB; // Keep only LHS and Undefs. 68} 69#endif 70 71/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4 72/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9. 73static unsigned getCompressedMask(unsigned short Mask) { 74 return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 + 75 getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3); 76} 77 78static void PrintMask(unsigned i, std::ostream &OS) { 79 OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0))) 80 << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1))) 81 << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2))) 82 << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3))) 83 << ">"; 84} 85 86/// ShuffleVal - This represents a shufflevector operation. 87struct ShuffleVal { 88 unsigned Cost; // Number of instrs used to generate this value. 89 Operator *Op; // The Operation used to generate this value. 90 unsigned short Arg0, Arg1; // Input operands for this value. 91 92 ShuffleVal() : Cost(1000000) {} 93}; 94 95 96/// ShufTab - This is the actual shuffle table that we are trying to generate. 97/// 98static ShuffleVal ShufTab[65536]; 99 100/// TheOperators - All of the operators that this target supports. 101static std::vector<Operator*> TheOperators; 102 103/// Operator - This is a vector operation that is available for use. 104struct Operator { 105 unsigned short ShuffleMask; 106 unsigned short OpNum; 107 const char *Name; 108 unsigned Cost; 109 110 Operator(unsigned short shufflemask, const char *name, unsigned opnum, 111 unsigned cost = 1) 112 : ShuffleMask(shufflemask), OpNum(opnum), Name(name), Cost(cost) { 113 TheOperators.push_back(this); 114 } 115 ~Operator() { 116 assert(TheOperators.back() == this); 117 TheOperators.pop_back(); 118 } 119 120 bool isOnlyLHSOperator() const { 121 return isOnlyLHSMask(ShuffleMask); 122 } 123 124 const char *getName() const { return Name; } 125 unsigned getCost() const { return Cost; } 126 127 unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) { 128 // Extract the elements from LHSMask and RHSMask, as appropriate. 129 unsigned Result = 0; 130 for (unsigned i = 0; i != 4; ++i) { 131 unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF; 132 unsigned ResElt; 133 if (SrcElt < 4) 134 ResElt = getMaskElt(LHSMask, SrcElt); 135 else if (SrcElt < 8) 136 ResElt = getMaskElt(RHSMask, SrcElt-4); 137 else { 138 assert(SrcElt == 8 && "Bad src elt!"); 139 ResElt = 8; 140 } 141 Result |= ResElt << (4*i); 142 } 143 return Result; 144 } 145}; 146 147static const char *getZeroCostOpName(unsigned short Op) { 148 if (ShufTab[Op].Arg0 == 0x0123) 149 return "LHS"; 150 else if (ShufTab[Op].Arg0 == 0x4567) 151 return "RHS"; 152 else { 153 assert(0 && "bad zero cost operation"); 154 abort(); 155 } 156} 157 158static void PrintOperation(unsigned ValNo, unsigned short Vals[]) { 159 unsigned short ThisOp = Vals[ValNo]; 160 std::cerr << "t" << ValNo; 161 PrintMask(ThisOp, std::cerr); 162 std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "("; 163 164 if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) { 165 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0); 166 PrintMask(ShufTab[ThisOp].Arg0, std::cerr); 167 } else { 168 // Figure out what tmp # it is. 169 for (unsigned i = 0; ; ++i) 170 if (Vals[i] == ShufTab[ThisOp].Arg0) { 171 std::cerr << "t" << i; 172 break; 173 } 174 } 175 176 if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) { 177 std::cerr << ", "; 178 if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) { 179 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1); 180 PrintMask(ShufTab[ThisOp].Arg1, std::cerr); 181 } else { 182 // Figure out what tmp # it is. 183 for (unsigned i = 0; ; ++i) 184 if (Vals[i] == ShufTab[ThisOp].Arg1) { 185 std::cerr << "t" << i; 186 break; 187 } 188 } 189 } 190 std::cerr << ") "; 191} 192 193static unsigned getNumEntered() { 194 unsigned Count = 0; 195 for (unsigned i = 0; i != 65536; ++i) 196 Count += ShufTab[i].Cost < 100; 197 return Count; 198} 199 200static void EvaluateOps(unsigned short Elt, unsigned short Vals[], 201 unsigned &NumVals) { 202 if (ShufTab[Elt].Cost == 0) return; 203 204 // If this value has already been evaluated, it is free. FIXME: match undefs. 205 for (unsigned i = 0, e = NumVals; i != e; ++i) 206 if (Vals[i] == Elt) return; 207 208 // Otherwise, get the operands of the value, then add it. 209 unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1; 210 if (ShufTab[Arg0].Cost) 211 EvaluateOps(Arg0, Vals, NumVals); 212 if (Arg0 != Arg1 && ShufTab[Arg1].Cost) 213 EvaluateOps(Arg1, Vals, NumVals); 214 215 Vals[NumVals++] = Elt; 216} 217 218 219int main() { 220 // Seed the table with accesses to the LHS and RHS. 221 ShufTab[0x0123].Cost = 0; 222 ShufTab[0x0123].Op = 0; 223 ShufTab[0x0123].Arg0 = 0x0123; 224 ShufTab[0x4567].Cost = 0; 225 ShufTab[0x4567].Op = 0; 226 ShufTab[0x4567].Arg0 = 0x4567; 227 228 // Seed the first-level of shuffles, shuffles whose inputs are the input to 229 // the vectorshuffle operation. 230 bool MadeChange = true; 231 unsigned OpCount = 0; 232 while (MadeChange) { 233 MadeChange = false; 234 ++OpCount; 235 std::cerr << "Starting iteration #" << OpCount << " with " 236 << getNumEntered() << " entries established.\n"; 237 238 // Scan the table for two reasons: First, compute the maximum cost of any 239 // operation left in the table. Second, make sure that values with undefs 240 // have the cheapest alternative that they match. 241 unsigned MaxCost = ShufTab[0].Cost; 242 for (unsigned i = 1; i != 0x8889; ++i) { 243 if (!isValidMask(i)) continue; 244 if (ShufTab[i].Cost > MaxCost) 245 MaxCost = ShufTab[i].Cost; 246 247 // If this value has an undef, make it be computed the cheapest possible 248 // way of any of the things that it matches. 249 if (hasUndefElements(i)) { 250 // This code is a little bit tricky, so here's the idea: consider some 251 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we 252 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If 253 // there are 3 undefs, the number rises to 729 entries we have to scan, 254 // and for the 4 undef case, we have to scan the whole table. 255 // 256 // Instead of doing this huge amount of scanning, we process the table 257 // entries *in order*, and use the fact that 'u' is 8, larger than any 258 // valid index. Given an entry like 7u4u then, we only need to scan 259 // 7[0-7]4u - 8 entries. We can get away with this, because we already 260 // know that each of 704u, 714u, 724u, etc contain the minimum value of 261 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively. 262 unsigned UndefIdx; 263 if (i & 0x8000) 264 UndefIdx = 0; 265 else if (i & 0x0800) 266 UndefIdx = 1; 267 else if (i & 0x0080) 268 UndefIdx = 2; 269 else if (i & 0x0008) 270 UndefIdx = 3; 271 else 272 abort(); 273 274 unsigned MinVal = i; 275 unsigned MinCost = ShufTab[i].Cost; 276 277 // Scan the 8 entries. 278 for (unsigned j = 0; j != 8; ++j) { 279 unsigned NewElt = setMaskElt(i, UndefIdx, j); 280 if (ShufTab[NewElt].Cost < MinCost) { 281 MinCost = ShufTab[NewElt].Cost; 282 MinVal = NewElt; 283 } 284 } 285 286 // If we found something cheaper than what was here before, use it. 287 if (i != MinVal) { 288 MadeChange = true; 289 ShufTab[i] = ShufTab[MinVal]; 290 } 291 } 292 } 293 294 for (unsigned LHS = 0; LHS != 0x8889; ++LHS) { 295 if (!isValidMask(LHS)) continue; 296 if (ShufTab[LHS].Cost > 1000) continue; 297 298 // If nothing involving this operand could possibly be cheaper than what 299 // we already have, don't consider it. 300 if (ShufTab[LHS].Cost + 1 >= MaxCost) 301 continue; 302 303 for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) { 304 Operator *Op = TheOperators[opnum]; 305 306 // Evaluate op(LHS,LHS) 307 unsigned ResultMask = Op->getTransformedMask(LHS, LHS); 308 309 unsigned Cost = ShufTab[LHS].Cost + Op->getCost(); 310 if (Cost < ShufTab[ResultMask].Cost) { 311 ShufTab[ResultMask].Cost = Cost; 312 ShufTab[ResultMask].Op = Op; 313 ShufTab[ResultMask].Arg0 = LHS; 314 ShufTab[ResultMask].Arg1 = LHS; 315 MadeChange = true; 316 } 317 318 // If this is a two input instruction, include the op(x,y) cases. If 319 // this is a one input instruction, skip this. 320 if (Op->isOnlyLHSOperator()) continue; 321 322 for (unsigned RHS = 0; RHS != 0x8889; ++RHS) { 323 if (!isValidMask(RHS)) continue; 324 if (ShufTab[RHS].Cost > 1000) continue; 325 326 // If nothing involving this operand could possibly be cheaper than 327 // what we already have, don't consider it. 328 if (ShufTab[RHS].Cost + 1 >= MaxCost) 329 continue; 330 331 332 // Evaluate op(LHS,RHS) 333 unsigned ResultMask = Op->getTransformedMask(LHS, RHS); 334 335 if (ShufTab[ResultMask].Cost <= OpCount || 336 ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost || 337 ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost) 338 continue; 339 340 // Figure out the cost to evaluate this, knowing that CSE's only need 341 // to be evaluated once. 342 unsigned short Vals[30]; 343 unsigned NumVals = 0; 344 EvaluateOps(LHS, Vals, NumVals); 345 EvaluateOps(RHS, Vals, NumVals); 346 347 unsigned Cost = NumVals + Op->getCost(); 348 if (Cost < ShufTab[ResultMask].Cost) { 349 ShufTab[ResultMask].Cost = Cost; 350 ShufTab[ResultMask].Op = Op; 351 ShufTab[ResultMask].Arg0 = LHS; 352 ShufTab[ResultMask].Arg1 = RHS; 353 MadeChange = true; 354 } 355 } 356 } 357 } 358 } 359 360 std::cerr << "Finished Table has " << getNumEntered() 361 << " entries established.\n"; 362 363 unsigned CostArray[10] = { 0 }; 364 365 // Compute a cost histogram. 366 for (unsigned i = 0; i != 65536; ++i) { 367 if (!isValidMask(i)) continue; 368 if (ShufTab[i].Cost > 9) 369 ++CostArray[9]; 370 else 371 ++CostArray[ShufTab[i].Cost]; 372 } 373 374 for (unsigned i = 0; i != 9; ++i) 375 if (CostArray[i]) 376 std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n"; 377 if (CostArray[9]) 378 std::cout << "// " << CostArray[9] << " entries have higher cost!\n"; 379 380 381 // Build up the table to emit. 382 std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n"; 383 std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n"; 384 385 for (unsigned i = 0; i != 0x8889; ++i) { 386 if (!isValidMask(i)) continue; 387 388 // CostSat - The cost of this operation saturated to two bits. 389 unsigned CostSat = ShufTab[i].Cost; 390 if (CostSat > 4) CostSat = 4; 391 if (CostSat == 0) CostSat = 1; 392 --CostSat; // Cost is now between 0-3. 393 394 unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0; 395 assert(OpNum < 16 && "Too few bits to encode operation!"); 396 397 unsigned LHS = getCompressedMask(ShufTab[i].Arg0); 398 unsigned RHS = getCompressedMask(ShufTab[i].Arg1); 399 400 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of 401 // LHS, and 13 bits of RHS = 32 bits. 402 unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS; 403 404 std::cout << " " << std::setw(10) << Val << "U, // "; 405 PrintMask(i, std::cout); 406 std::cout << ": Cost " << ShufTab[i].Cost; 407 std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy"); 408 std::cout << " "; 409 if (ShufTab[ShufTab[i].Arg0].Cost == 0) { 410 std::cout << getZeroCostOpName(ShufTab[i].Arg0); 411 } else { 412 PrintMask(ShufTab[i].Arg0, std::cout); 413 } 414 415 if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) { 416 std::cout << ", "; 417 if (ShufTab[ShufTab[i].Arg1].Cost == 0) { 418 std::cout << getZeroCostOpName(ShufTab[i].Arg1); 419 } else { 420 PrintMask(ShufTab[i].Arg1, std::cout); 421 } 422 } 423 std::cout << "\n"; 424 } 425 std::cout << " 0\n};\n"; 426 427 if (0) { 428 // Print out the table. 429 for (unsigned i = 0; i != 0x8889; ++i) { 430 if (!isValidMask(i)) continue; 431 if (ShufTab[i].Cost < 1000) { 432 PrintMask(i, std::cerr); 433 std::cerr << " - Cost " << ShufTab[i].Cost << " - "; 434 435 unsigned short Vals[30]; 436 unsigned NumVals = 0; 437 EvaluateOps(i, Vals, NumVals); 438 439 for (unsigned j = 0, e = NumVals; j != e; ++j) 440 PrintOperation(j, Vals); 441 std::cerr << "\n"; 442 } 443 } 444 } 445} 446 447 448#ifdef GENERATE_ALTIVEC 449 450///===---------------------------------------------------------------------===// 451/// The altivec instruction definitions. This is the altivec-specific part of 452/// this file. 453///===---------------------------------------------------------------------===// 454 455// Note that the opcode numbers here must match those in the PPC backend. 456enum { 457 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 458 OP_VMRGHW, 459 OP_VMRGLW, 460 OP_VSPLTISW0, 461 OP_VSPLTISW1, 462 OP_VSPLTISW2, 463 OP_VSPLTISW3, 464 OP_VSLDOI4, 465 OP_VSLDOI8, 466 OP_VSLDOI12 467}; 468 469struct vmrghw : public Operator { 470 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {} 471} the_vmrghw; 472 473struct vmrglw : public Operator { 474 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {} 475} the_vmrglw; 476 477template<unsigned Elt> 478struct vspltisw : public Operator { 479 vspltisw(const char *N, unsigned Opc) 480 : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {} 481}; 482 483vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0); 484vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1); 485vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2); 486vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3); 487 488template<unsigned N> 489struct vsldoi : public Operator { 490 vsldoi(const char *Name, unsigned Opc) 491 : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) { 492 } 493}; 494 495vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4); 496vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8); 497vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12); 498 499#endif 500 501#define GENERATE_NEON 502 503#ifdef GENERATE_NEON 504enum { 505 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 506 OP_VREV, 507 OP_VDUP0, 508 OP_VDUP1, 509 OP_VDUP2, 510 OP_VDUP3, 511 OP_VEXT1, 512 OP_VEXT2, 513 OP_VEXT3, 514 OP_VUZPL, // VUZP, left result 515 OP_VUZPR, // VUZP, right result 516 OP_VZIPL, // VZIP, left result 517 OP_VZIPR, // VZIP, right result 518 OP_VTRNL, // VTRN, left result 519 OP_VTRNR // VTRN, right result 520}; 521 522struct vrev : public Operator { 523 vrev() : Operator(0x1032, "vrev", OP_VREV) {} 524} the_vrev; 525 526template<unsigned Elt> 527struct vdup : public Operator { 528 vdup(const char *N, unsigned Opc) 529 : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {} 530}; 531 532vdup<0> the_vdup0("vdup0", OP_VDUP0); 533vdup<1> the_vdup1("vdup1", OP_VDUP1); 534vdup<2> the_vdup2("vdup2", OP_VDUP2); 535vdup<3> the_vdup3("vdup3", OP_VDUP3); 536 537template<unsigned N> 538struct vext : public Operator { 539 vext(const char *Name, unsigned Opc) 540 : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) { 541 } 542}; 543 544vext<1> the_vext1("vext1", OP_VEXT1); 545vext<2> the_vext2("vext2", OP_VEXT2); 546vext<3> the_vext3("vext3", OP_VEXT3); 547 548struct vuzpl : public Operator { 549 vuzpl() : Operator(0x0246, "vuzpl", OP_VUZPL, 2) {} 550} the_vuzpl; 551 552struct vuzpr : public Operator { 553 vuzpr() : Operator(0x1357, "vuzpr", OP_VUZPR, 2) {} 554} the_vuzpr; 555 556struct vzipl : public Operator { 557 vzipl() : Operator(0x0415, "vzipl", OP_VZIPL, 2) {} 558} the_vzipl; 559 560struct vzipr : public Operator { 561 vzipr() : Operator(0x2637, "vzipr", OP_VZIPR, 2) {} 562} the_vzipr; 563 564struct vtrnl : public Operator { 565 vtrnl() : Operator(0x0426, "vtrnl", OP_VTRNL, 2) {} 566} the_vtrnl; 567 568struct vtrnr : public Operator { 569 vtrnr() : Operator(0x1537, "vtrnr", OP_VTRNR, 2) {} 570} the_vtrnr; 571 572#endif 573