PerfectShuffle.cpp revision 64a8dddb73bba20dd24fb3a233a39cbc79040fef
1//===-- PerfectShuffle.cpp - Perfect Shuffle Generator --------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file was developed by Chris Lattner and is distributed under 6// the University of Illinois Open Source License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file computes an optimal sequence of instructions for doing all shuffles 11// of two 4-element vectors. With a release build and when configured to emit 12// an altivec instruction table, this takes about 30s to run on a 2.7Ghz 13// PowerPC G5. 14// 15//===----------------------------------------------------------------------===// 16 17#include <iostream> 18#include <vector> 19#include <cassert> 20 21struct Operator; 22 23// Masks are 4-nibble hex numbers. Values 0-7 in any nibble means that it takes 24// an element from that value of the input vectors. A value of 8 means the 25// entry is undefined. 26 27// Mask manipulation functions. 28static inline unsigned short MakeMask(unsigned V0, unsigned V1, 29 unsigned V2, unsigned V3) { 30 return (V0 << (3*4)) | (V1 << (2*4)) | (V2 << (1*4)) | (V3 << (0*4)); 31} 32 33/// getMaskElt - Return element N of the specified mask. 34static unsigned getMaskElt(unsigned Mask, unsigned Elt) { 35 return (Mask >> ((3-Elt)*4)) & 0xF; 36} 37 38static unsigned setMaskElt(unsigned Mask, unsigned Elt, unsigned NewVal) { 39 unsigned FieldShift = ((3-Elt)*4); 40 return (Mask & ~(0xF << FieldShift)) | (NewVal << FieldShift); 41} 42 43// Reject elements where the values are 9-15. 44static bool isValidMask(unsigned short Mask) { 45 unsigned short UndefBits = Mask & 0x8888; 46 return (Mask & ((UndefBits >> 1)|(UndefBits>>2)|(UndefBits>>3))) == 0; 47} 48 49/// hasUndefElements - Return true if any of the elements in the mask are undefs 50/// 51static bool hasUndefElements(unsigned short Mask) { 52 return (Mask & 0x8888) != 0; 53} 54 55/// isOnlyLHSMask - Return true if this mask only refers to its LHS, not 56/// including undef values.. 57static bool isOnlyLHSMask(unsigned short Mask) { 58 return (Mask & 0x4444) == 0; 59} 60 61/// getLHSOnlyMask - Given a mask that refers to its LHS and RHS, modify it to 62/// refer to the LHS only (for when one argument value is passed into the same 63/// function twice). 64#if 0 65static unsigned short getLHSOnlyMask(unsigned short Mask) { 66 return Mask & 0xBBBB; // Keep only LHS and Undefs. 67} 68#endif 69 70/// getCompressedMask - Turn a 16-bit uncompressed mask (where each elt uses 4 71/// bits) into a compressed 13-bit mask, where each elt is multiplied by 9. 72static unsigned getCompressedMask(unsigned short Mask) { 73 return getMaskElt(Mask, 0)*9*9*9 + getMaskElt(Mask, 1)*9*9 + 74 getMaskElt(Mask, 2)*9 + getMaskElt(Mask, 3); 75} 76 77static void PrintMask(unsigned i, std::ostream &OS) { 78 OS << "<" << (char)(getMaskElt(i, 0) == 8 ? 'u' : ('0'+getMaskElt(i, 0))) 79 << "," << (char)(getMaskElt(i, 1) == 8 ? 'u' : ('0'+getMaskElt(i, 1))) 80 << "," << (char)(getMaskElt(i, 2) == 8 ? 'u' : ('0'+getMaskElt(i, 2))) 81 << "," << (char)(getMaskElt(i, 3) == 8 ? 'u' : ('0'+getMaskElt(i, 3))) 82 << ">"; 83} 84 85/// ShuffleVal - This represents a shufflevector operation. 86struct ShuffleVal { 87 unsigned Cost; // Number of instrs used to generate this value. 88 Operator *Op; // The Operation used to generate this value. 89 unsigned short Arg0, Arg1; // Input operands for this value. 90 91 ShuffleVal() : Cost(1000000) {} 92}; 93 94 95/// ShufTab - This is the actual shuffle table that we are trying to generate. 96/// 97static ShuffleVal ShufTab[65536]; 98 99/// TheOperators - All of the operators that this target supports. 100static std::vector<Operator*> TheOperators; 101 102/// Operator - This is a vector operation that is available for use. 103struct Operator { 104 unsigned short ShuffleMask; 105 unsigned short OpNum; 106 const char *Name; 107 108 Operator(unsigned short shufflemask, const char *name, unsigned opnum) 109 : ShuffleMask(shufflemask), OpNum(opnum), Name(name) { 110 TheOperators.push_back(this); 111 } 112 ~Operator() { 113 assert(TheOperators.back() == this); 114 TheOperators.pop_back(); 115 } 116 117 bool isOnlyLHSOperator() const { 118 return isOnlyLHSMask(ShuffleMask); 119 } 120 121 const char *getName() const { return Name; } 122 123 unsigned short getTransformedMask(unsigned short LHSMask, unsigned RHSMask) { 124 // Extract the elements from LHSMask and RHSMask, as appropriate. 125 unsigned Result = 0; 126 for (unsigned i = 0; i != 4; ++i) { 127 unsigned SrcElt = (ShuffleMask >> (4*i)) & 0xF; 128 unsigned ResElt; 129 if (SrcElt < 4) 130 ResElt = getMaskElt(LHSMask, SrcElt); 131 else if (SrcElt < 8) 132 ResElt = getMaskElt(RHSMask, SrcElt-4); 133 else { 134 assert(SrcElt == 8 && "Bad src elt!"); 135 ResElt = 8; 136 } 137 Result |= ResElt << (4*i); 138 } 139 return Result; 140 } 141}; 142 143static const char *getZeroCostOpName(unsigned short Op) { 144 if (ShufTab[Op].Arg0 == 0x0123) 145 return "LHS"; 146 else if (ShufTab[Op].Arg0 == 0x4567) 147 return "RHS"; 148 else { 149 assert(0 && "bad zero cost operation"); 150 abort(); 151 } 152} 153 154static void PrintOperation(unsigned ValNo, unsigned short Vals[]) { 155 unsigned short ThisOp = Vals[ValNo]; 156 std::cerr << "t" << ValNo; 157 PrintMask(ThisOp, std::cerr); 158 std::cerr << " = " << ShufTab[ThisOp].Op->getName() << "("; 159 160 if (ShufTab[ShufTab[ThisOp].Arg0].Cost == 0) { 161 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg0); 162 PrintMask(ShufTab[ThisOp].Arg0, std::cerr); 163 } else { 164 // Figure out what tmp # it is. 165 for (unsigned i = 0; ; ++i) 166 if (Vals[i] == ShufTab[ThisOp].Arg0) { 167 std::cerr << "t" << i; 168 break; 169 } 170 } 171 172 if (!ShufTab[Vals[ValNo]].Op->isOnlyLHSOperator()) { 173 std::cerr << ", "; 174 if (ShufTab[ShufTab[ThisOp].Arg1].Cost == 0) { 175 std::cerr << getZeroCostOpName(ShufTab[ThisOp].Arg1); 176 PrintMask(ShufTab[ThisOp].Arg1, std::cerr); 177 } else { 178 // Figure out what tmp # it is. 179 for (unsigned i = 0; ; ++i) 180 if (Vals[i] == ShufTab[ThisOp].Arg1) { 181 std::cerr << "t" << i; 182 break; 183 } 184 } 185 } 186 std::cerr << ") "; 187} 188 189static unsigned getNumEntered() { 190 unsigned Count = 0; 191 for (unsigned i = 0; i != 65536; ++i) 192 Count += ShufTab[i].Cost < 100; 193 return Count; 194} 195 196static void EvaluateOps(unsigned short Elt, unsigned short Vals[], 197 unsigned &NumVals) { 198 if (ShufTab[Elt].Cost == 0) return; 199 200 // If this value has already been evaluated, it is free. FIXME: match undefs. 201 for (unsigned i = 0, e = NumVals; i != e; ++i) 202 if (Vals[i] == Elt) return; 203 204 // Otherwise, get the operands of the value, then add it. 205 unsigned Arg0 = ShufTab[Elt].Arg0, Arg1 = ShufTab[Elt].Arg1; 206 if (ShufTab[Arg0].Cost) 207 EvaluateOps(Arg0, Vals, NumVals); 208 if (Arg0 != Arg1 && ShufTab[Arg1].Cost) 209 EvaluateOps(Arg1, Vals, NumVals); 210 211 Vals[NumVals++] = Elt; 212} 213 214 215int main() { 216 // Seed the table with accesses to the LHS and RHS. 217 ShufTab[0x0123].Cost = 0; 218 ShufTab[0x0123].Op = 0; 219 ShufTab[0x0123].Arg0 = 0x0123; 220 ShufTab[0x4567].Cost = 0; 221 ShufTab[0x4567].Op = 0; 222 ShufTab[0x4567].Arg0 = 0x4567; 223 224 // Seed the first-level of shuffles, shuffles whose inputs are the input to 225 // the vectorshuffle operation. 226 bool MadeChange = true; 227 unsigned OpCount = 0; 228 while (MadeChange) { 229 MadeChange = false; 230 ++OpCount; 231 std::cerr << "Starting iteration #" << OpCount << " with " 232 << getNumEntered() << " entries established.\n"; 233 234 // Scan the table for two reasons: First, compute the maximum cost of any 235 // operation left in the table. Second, make sure that values with undefs 236 // have the cheapest alternative that they match. 237 unsigned MaxCost = ShufTab[0].Cost; 238 for (unsigned i = 1; i != 0x8889; ++i) { 239 if (!isValidMask(i)) continue; 240 if (ShufTab[i].Cost > MaxCost) 241 MaxCost = ShufTab[i].Cost; 242 243 // If this value has an undef, make it be computed the cheapest possible 244 // way of any of the things that it matches. 245 if (hasUndefElements(i)) { 246 // This code is a little bit tricky, so here's the idea: consider some 247 // permutation, like 7u4u. To compute the lowest cost for 7u4u, we 248 // need to take the minimum cost of all of 7[0-8]4[0-8], 81 entries. If 249 // there are 3 undefs, the number rises to 729 entries we have to scan, 250 // and for the 4 undef case, we have to scan the whole table. 251 // 252 // Instead of doing this huge amount of scanning, we process the table 253 // entries *in order*, and use the fact that 'u' is 8, larger than any 254 // valid index. Given an entry like 7u4u then, we only need to scan 255 // 7[0-7]4u - 8 entries. We can get away with this, because we already 256 // know that each of 704u, 714u, 724u, etc contain the minimum value of 257 // all of the 704[0-8], 714[0-8] and 724[0-8] entries respectively. 258 unsigned UndefIdx; 259 if (i & 0x8000) 260 UndefIdx = 0; 261 else if (i & 0x0800) 262 UndefIdx = 1; 263 else if (i & 0x0080) 264 UndefIdx = 2; 265 else if (i & 0x0008) 266 UndefIdx = 3; 267 else 268 abort(); 269 270 unsigned MinVal = i; 271 unsigned MinCost = ShufTab[i].Cost; 272 273 // Scan the 8 entries. 274 for (unsigned j = 0; j != 8; ++j) { 275 unsigned NewElt = setMaskElt(i, UndefIdx, j); 276 if (ShufTab[NewElt].Cost < MinCost) { 277 MinCost = ShufTab[NewElt].Cost; 278 MinVal = NewElt; 279 } 280 } 281 282 // If we found something cheaper than what was here before, use it. 283 if (i != MinVal) { 284 MadeChange = true; 285 ShufTab[i] = ShufTab[MinVal]; 286 } 287 } 288 } 289 290 for (unsigned LHS = 0; LHS != 0x8889; ++LHS) { 291 if (!isValidMask(LHS)) continue; 292 if (ShufTab[LHS].Cost > 1000) continue; 293 294 // If nothing involving this operand could possibly be cheaper than what 295 // we already have, don't consider it. 296 if (ShufTab[LHS].Cost + 1 >= MaxCost) 297 continue; 298 299 for (unsigned opnum = 0, e = TheOperators.size(); opnum != e; ++opnum) { 300 Operator *Op = TheOperators[opnum]; 301 302 // Evaluate op(LHS,LHS) 303 unsigned ResultMask = Op->getTransformedMask(LHS, LHS); 304 305 unsigned Cost = ShufTab[LHS].Cost + 1; 306 if (Cost < ShufTab[ResultMask].Cost) { 307 ShufTab[ResultMask].Cost = Cost; 308 ShufTab[ResultMask].Op = Op; 309 ShufTab[ResultMask].Arg0 = LHS; 310 ShufTab[ResultMask].Arg1 = LHS; 311 MadeChange = true; 312 } 313 314 // If this is a two input instruction, include the op(x,y) cases. If 315 // this is a one input instruction, skip this. 316 if (Op->isOnlyLHSOperator()) continue; 317 318 for (unsigned RHS = 0; RHS != 0x8889; ++RHS) { 319 if (!isValidMask(RHS)) continue; 320 if (ShufTab[RHS].Cost > 1000) continue; 321 322 // If nothing involving this operand could possibly be cheaper than 323 // what we already have, don't consider it. 324 if (ShufTab[RHS].Cost + 1 >= MaxCost) 325 continue; 326 327 328 // Evaluate op(LHS,RHS) 329 unsigned ResultMask = Op->getTransformedMask(LHS, RHS); 330 331 if (ShufTab[ResultMask].Cost <= OpCount || 332 ShufTab[ResultMask].Cost <= ShufTab[LHS].Cost || 333 ShufTab[ResultMask].Cost <= ShufTab[RHS].Cost) 334 continue; 335 336 // Figure out the cost to evaluate this, knowing that CSE's only need 337 // to be evaluated once. 338 unsigned short Vals[30]; 339 unsigned NumVals = 0; 340 EvaluateOps(LHS, Vals, NumVals); 341 EvaluateOps(RHS, Vals, NumVals); 342 343 unsigned Cost = NumVals + 1; 344 if (Cost < ShufTab[ResultMask].Cost) { 345 ShufTab[ResultMask].Cost = Cost; 346 ShufTab[ResultMask].Op = Op; 347 ShufTab[ResultMask].Arg0 = LHS; 348 ShufTab[ResultMask].Arg1 = RHS; 349 MadeChange = true; 350 } 351 } 352 } 353 } 354 } 355 356 std::cerr << "Finished Table has " << getNumEntered() 357 << " entries established.\n"; 358 359 unsigned CostArray[10] = { 0 }; 360 361 // Compute a cost histogram. 362 for (unsigned i = 0; i != 65536; ++i) { 363 if (!isValidMask(i)) continue; 364 if (ShufTab[i].Cost > 9) 365 ++CostArray[9]; 366 else 367 ++CostArray[ShufTab[i].Cost]; 368 } 369 370 for (unsigned i = 0; i != 9; ++i) 371 if (CostArray[i]) 372 std::cout << "// " << CostArray[i] << " entries have cost " << i << "\n"; 373 if (CostArray[9]) 374 std::cout << "// " << CostArray[9] << " entries have higher cost!\n"; 375 376 377 // Build up the table to emit. 378 std::cout << "\n// This table is 6561*4 = 26244 bytes in size.\n"; 379 std::cout << "static const unsigned PerfectShuffleTable[6561+1] = {\n"; 380 381 for (unsigned i = 0; i != 0x8889; ++i) { 382 if (!isValidMask(i)) continue; 383 384 // CostSat - The cost of this operation saturated to two bits. 385 unsigned CostSat = ShufTab[i].Cost; 386 if (CostSat > 4) CostSat = 4; 387 if (CostSat == 0) CostSat = 1; 388 --CostSat; // Cost is now between 0-3. 389 390 unsigned OpNum = ShufTab[i].Op ? ShufTab[i].Op->OpNum : 0; 391 assert(OpNum < 16 && "Too few bits to encode operation!"); 392 393 unsigned LHS = getCompressedMask(ShufTab[i].Arg0); 394 unsigned RHS = getCompressedMask(ShufTab[i].Arg1); 395 396 // Encode this as 2 bits of saturated cost, 4 bits of opcodes, 13 bits of 397 // LHS, and 13 bits of RHS = 32 bits. 398 unsigned Val = (CostSat << 30) | (OpNum << 26) | (LHS << 13) | RHS; 399 400 std::cout << " " << Val << "U,\t// "; 401 PrintMask(i, std::cout); 402 std::cout << ": Cost " << ShufTab[i].Cost; 403 std::cout << " " << (ShufTab[i].Op ? ShufTab[i].Op->getName() : "copy"); 404 std::cout << " "; 405 if (ShufTab[ShufTab[i].Arg0].Cost == 0) { 406 std::cout << getZeroCostOpName(ShufTab[i].Arg0); 407 } else { 408 PrintMask(ShufTab[i].Arg0, std::cout); 409 } 410 411 if (ShufTab[i].Op && !ShufTab[i].Op->isOnlyLHSOperator()) { 412 std::cout << ", "; 413 if (ShufTab[ShufTab[i].Arg1].Cost == 0) { 414 std::cout << getZeroCostOpName(ShufTab[i].Arg1); 415 } else { 416 PrintMask(ShufTab[i].Arg1, std::cout); 417 } 418 } 419 std::cout << "\n"; 420 } 421 std::cout << " 0\n};\n"; 422 423 if (0) { 424 // Print out the table. 425 for (unsigned i = 0; i != 0x8889; ++i) { 426 if (!isValidMask(i)) continue; 427 if (ShufTab[i].Cost < 1000) { 428 PrintMask(i, std::cerr); 429 std::cerr << " - Cost " << ShufTab[i].Cost << " - "; 430 431 unsigned short Vals[30]; 432 unsigned NumVals = 0; 433 EvaluateOps(i, Vals, NumVals); 434 435 for (unsigned j = 0, e = NumVals; j != e; ++j) 436 PrintOperation(j, Vals); 437 std::cerr << "\n"; 438 } 439 } 440 } 441} 442 443 444#define GENERATE_ALTIVEC 445 446#ifdef GENERATE_ALTIVEC 447 448///===---------------------------------------------------------------------===// 449/// The altivec instruction definitions. This is the altivec-specific part of 450/// this file. 451///===---------------------------------------------------------------------===// 452 453// Note that the opcode numbers here must match those in the PPC backend. 454enum { 455 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> 456 OP_VMRGHW, 457 OP_VMRGLW, 458 OP_VSPLTISW0, 459 OP_VSPLTISW1, 460 OP_VSPLTISW2, 461 OP_VSPLTISW3, 462 OP_VSLDOI4, 463 OP_VSLDOI8, 464 OP_VSLDOI12 465}; 466 467struct vmrghw : public Operator { 468 vmrghw() : Operator(0x0415, "vmrghw", OP_VMRGHW) {} 469} the_vmrghw; 470 471struct vmrglw : public Operator { 472 vmrglw() : Operator(0x2637, "vmrglw", OP_VMRGLW) {} 473} the_vmrglw; 474 475template<unsigned Elt> 476struct vspltisw : public Operator { 477 vspltisw(const char *N, unsigned Opc) 478 : Operator(MakeMask(Elt, Elt, Elt, Elt), N, Opc) {} 479}; 480 481vspltisw<0> the_vspltisw0("vspltisw0", OP_VSPLTISW0); 482vspltisw<1> the_vspltisw1("vspltisw1", OP_VSPLTISW1); 483vspltisw<2> the_vspltisw2("vspltisw2", OP_VSPLTISW2); 484vspltisw<3> the_vspltisw3("vspltisw3", OP_VSPLTISW3); 485 486template<unsigned N> 487struct vsldoi : public Operator { 488 vsldoi(const char *Name, unsigned Opc) 489 : Operator(MakeMask(N&7, (N+1)&7, (N+2)&7, (N+3)&7), Name, Opc) { 490 } 491}; 492 493vsldoi<1> the_vsldoi1("vsldoi4" , OP_VSLDOI4); 494vsldoi<2> the_vsldoi2("vsldoi8" , OP_VSLDOI8); 495vsldoi<3> the_vsldoi3("vsldoi12", OP_VSLDOI12); 496 497#endif 498