rsCpuScriptGroup2.cpp revision 1efae29f4bbe6c165caf6dfc4b89cf8a5f8c469b
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4#include <stdio.h> 5#include <stdlib.h> 6#include <unistd.h> 7 8#include <set> 9#include <sstream> 10#include <string> 11#include <vector> 12 13#ifndef RS_COMPATIBILITY_LIB 14#include "bcc/Config/Config.h" 15#endif 16 17#include "cpu_ref/rsCpuCore.h" 18#include "rsClosure.h" 19#include "rsContext.h" 20#include "rsCpuCore.h" 21#include "rsCpuExecutable.h" 22#include "rsCpuScript.h" 23#include "rsScript.h" 24#include "rsScriptGroup2.h" 25#include "rsScriptIntrinsic.h" 26 27using std::string; 28using std::vector; 29 30namespace android { 31namespace renderscript { 32 33namespace { 34 35const size_t DefaultKernelArgCount = 2; 36 37void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart, 38 uint32_t xend, uint32_t outstep) { 39 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr; 40 RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo); 41 42 const size_t oldInLen = mutable_kinfo->inLen; 43 44 decltype(mutable_kinfo->inStride) oldInStride; 45 memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride)); 46 47 for (CPUClosure* cpuClosure : closures) { 48 const Closure* closure = cpuClosure->mClosure; 49 50 // There had better be enough space in mutable_kinfo 51 rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT); 52 53 for (size_t i = 0; i < closure->mNumArg; i++) { 54 const void* arg = closure->mArgs[i]; 55 const Allocation* a = (const Allocation*)arg; 56 const uint32_t eStride = a->mHal.state.elementSizeBytes; 57 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 58 eStride * xstart; 59 if (kinfo->dim.y > 1) { 60 ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y; 61 } 62 mutable_kinfo->inPtr[i] = ptr; 63 mutable_kinfo->inStride[i] = eStride; 64 } 65 mutable_kinfo->inLen = closure->mNumArg; 66 67 const Allocation* out = closure->mReturnValue; 68 const uint32_t ostep = out->mHal.state.elementSizeBytes; 69 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 70 ostep * xstart; 71 if (kinfo->dim.y > 1) { 72 ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y; 73 } 74 75 rsAssert(kinfo->outLen <= 1); 76 mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr); 77 78 cpuClosure->mFunc(kinfo, xstart, xend, ostep); 79 } 80 81 mutable_kinfo->inLen = oldInLen; 82 memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride)); 83} 84 85} // namespace 86 87Batch::Batch(CpuScriptGroup2Impl* group, const char* name) : 88 mGroup(group), mFunc(nullptr) { 89 mName = strndup(name, strlen(name)); 90} 91 92Batch::~Batch() { 93 for (CPUClosure* c : mClosures) { 94 delete c; 95 } 96 free(mName); 97} 98 99bool Batch::conflict(CPUClosure* cpuClosure) const { 100 if (mClosures.empty()) { 101 return false; 102 } 103 104 const Closure* closure = cpuClosure->mClosure; 105 106 if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) { 107 // An invoke should be in a batch by itself, so it conflicts with any other 108 // closure. 109 return true; 110 } 111 112 const auto& globalDeps = closure->mGlobalDeps; 113 const auto& argDeps = closure->mArgDeps; 114 115 for (CPUClosure* c : mClosures) { 116 const Closure* batched = c->mClosure; 117 if (globalDeps.find(batched) != globalDeps.end()) { 118 return true; 119 } 120 const auto& it = argDeps.find(batched); 121 if (it != argDeps.end()) { 122 const auto& args = (*it).second; 123 for (const auto &p1 : *args) { 124 if (p1.second.get() != nullptr) { 125 return true; 126 } 127 } 128 } 129 } 130 131 // The compiler fusion pass in bcc expects that kernels chained up through 132 // (1st) input and output. 133 134 const Closure* lastBatched = mClosures.back()->mClosure; 135 const auto& it = argDeps.find(lastBatched); 136 137 if (it == argDeps.end()) { 138 return true; 139 } 140 141 const auto& args = (*it).second; 142 for (const auto &p1 : *args) { 143 if (p1.first == 0 && p1.second.get() == nullptr) { 144 // The new closure depends on the last batched closure's return 145 // value (fieldId being nullptr) for its first argument (argument 0) 146 return false; 147 } 148 } 149 150 return true; 151} 152 153CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 154 const ScriptGroupBase *sg) : 155 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)), 156 mExecutable(nullptr), mScriptObj(nullptr) { 157 rsAssert(!mGroup->mClosures.empty()); 158 159 mCpuRefImpl->lockMutex(); 160 Batch* batch = new Batch(this, "Batch0"); 161 int i = 0; 162 for (Closure* closure: mGroup->mClosures) { 163 CPUClosure* cc; 164 const IDBase* funcID = closure->mFunctionID.get(); 165 RsdCpuScriptImpl* si = 166 (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript); 167 if (closure->mIsKernel) { 168 MTLaunchStruct mtls; 169 si->forEachKernelSetup(funcID->mSlot, &mtls); 170 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel); 171 } else { 172 cc = new CPUClosure(closure, si); 173 } 174 175 if (batch->conflict(cc)) { 176 mBatches.push_back(batch); 177 std::stringstream ss; 178 ss << "Batch" << ++i; 179 batch = new Batch(this, ss.str().c_str()); 180 } 181 182 batch->mClosures.push_back(cc); 183 } 184 185 rsAssert(!batch->mClosures.empty()); 186 mBatches.push_back(batch); 187 188#ifndef RS_COMPATIBILITY_LIB 189 compile(mGroup->mCacheDir); 190 if (mScriptObj != nullptr && mExecutable != nullptr) { 191 for (Batch* batch : mBatches) { 192 batch->resolveFuncPtr(mScriptObj); 193 } 194 } 195#endif // RS_COMPATIBILITY_LIB 196 mCpuRefImpl->unlockMutex(); 197} 198 199void Batch::resolveFuncPtr(void* sharedObj) { 200 std::string funcName(mName); 201 if (mClosures.front()->mClosure->mIsKernel) { 202 funcName.append(".expand"); 203 } 204 mFunc = dlsym(sharedObj, funcName.c_str()); 205 rsAssert (mFunc != nullptr); 206} 207 208CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 209 for (Batch* batch : mBatches) { 210 delete batch; 211 } 212 delete mExecutable; 213 // TODO: move this dlclose into ~ScriptExecutable(). 214 if (mScriptObj != nullptr) { 215 dlclose(mScriptObj); 216 } 217} 218 219namespace { 220 221#ifndef RS_COMPATIBILITY_LIB 222 223string getCoreLibPath(Context* context, string* coreLibRelaxedPath) { 224 *coreLibRelaxedPath = ""; 225 226 // If we're debugging, use the debug library. 227 if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) { 228 return SYSLIBPATH"/libclcore_debug.bc"; 229 } 230 231 // Check for a platform specific library 232 233#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON) 234 // NEON-capable ARMv7a devices can use an accelerated math library 235 // for all reduced precision scripts. 236 // ARMv8 does not use NEON, as ASIMD can be used with all precision 237 // levels. 238 *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc"; 239#endif 240 241#if defined(__i386__) || defined(__x86_64__) 242 // x86 devices will use an optimized library. 243 return SYSLIBPATH"/libclcore_x86.bc"; 244#else 245 return SYSLIBPATH"/libclcore.bc"; 246#endif 247} 248 249void setupCompileArguments( 250 const vector<const char*>& inputs, const vector<string>& kernelBatches, 251 const vector<string>& invokeBatches, 252 const char* outputDir, const char* outputFileName, 253 const char* coreLibPath, const char* coreLibRelaxedPath, 254 const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant, 255 vector<const char*>* args) { 256 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 257 args->push_back("-fPIC"); 258 args->push_back("-embedRSInfo"); 259 if (emitGlobalInfo) { 260 args->push_back("-rs-global-info"); 261 if (emitGlobalInfoSkipConstant) { 262 args->push_back("-rs-global-info-skip-constant"); 263 } 264 } 265 args->push_back("-mtriple"); 266 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 267 args->push_back("-bclib"); 268 args->push_back(coreLibPath); 269 args->push_back("-bclib_relaxed"); 270 args->push_back(coreLibRelaxedPath); 271 for (const char* input : inputs) { 272 args->push_back(input); 273 } 274 for (const string& batch : kernelBatches) { 275 args->push_back("-merge"); 276 args->push_back(batch.c_str()); 277 } 278 for (const string& batch : invokeBatches) { 279 args->push_back("-invoke"); 280 args->push_back(batch.c_str()); 281 } 282 args->push_back("-output_path"); 283 args->push_back(outputDir); 284 285 // The output filename has to be the last, in case we need to pop it out and 286 // replace with a different name. 287 args->push_back("-o"); 288 args->push_back(outputFileName); 289} 290 291void generateSourceSlot(RsdCpuReferenceImpl* ctxt, 292 const Closure& closure, 293 const std::vector<const char*>& inputs, 294 std::stringstream& ss) { 295 const IDBase* funcID = (const IDBase*)closure.mFunctionID.get(); 296 const Script* script = funcID->mScript; 297 298 rsAssert (!script->isIntrinsic()); 299 300 const RsdCpuScriptImpl *cpuScript = 301 (const RsdCpuScriptImpl *)ctxt->lookupScript(script); 302 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 303 304 const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) - 305 inputs.begin(); 306 307 ss << index << "," << funcID->mSlot << "."; 308} 309 310#endif // RS_COMPATIBILTY_LIB 311 312} // anonymous namespace 313 314void CpuScriptGroup2Impl::compile(const char* cacheDir) { 315#ifndef RS_COMPATIBILITY_LIB 316 if (mGroup->mClosures.size() < 2) { 317 return; 318 } 319 320 auto comparator = [](const char* str1, const char* str2) -> bool { 321 return strcmp(str1, str2) < 0; 322 }; 323 std::set<const char*, decltype(comparator)> inputSet(comparator); 324 325 for (Closure* closure : mGroup->mClosures) { 326 const Script* script = closure->mFunctionID.get()->mScript; 327 328 // If any script is an intrinsic, give up trying fusing the kernels. 329 if (script->isIntrinsic()) { 330 return; 331 } 332 333 const RsdCpuScriptImpl *cpuScript = 334 (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script); 335 336 const char* bitcodeFilename = cpuScript->getBitcodeFilePath(); 337 inputSet.insert(bitcodeFilename); 338 } 339 340 std::vector<const char*> inputs(inputSet.begin(), inputSet.end()); 341 342 std::vector<string> kernelBatches; 343 std::vector<string> invokeBatches; 344 345 int i = 0; 346 for (const auto& batch : mBatches) { 347 rsAssert(batch->size() > 0); 348 349 std::stringstream ss; 350 ss << batch->mName << ":"; 351 352 if (!batch->mClosures.front()->mClosure->mIsKernel) { 353 rsAssert(batch->size() == 1); 354 generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss); 355 invokeBatches.push_back(ss.str()); 356 } else { 357 for (const auto& cpuClosure : batch->mClosures) { 358 generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss); 359 } 360 kernelBatches.push_back(ss.str()); 361 } 362 } 363 364 rsAssert(cacheDir != nullptr); 365 string objFilePath(cacheDir); 366 objFilePath.append("/"); 367 objFilePath.append(mGroup->mName); 368 objFilePath.append(".o"); 369 370 const char* resName = mGroup->mName; 371 string coreLibRelaxedPath; 372 const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(), 373 &coreLibRelaxedPath); 374 375 vector<const char*> arguments; 376 bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo(); 377 bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant(); 378 setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir, 379 resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(), 380 emitGlobalInfo, emitGlobalInfoSkipConstant, 381 &arguments); 382 383 std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1, 384 arguments.data())); 385 386 inputs.push_back(coreLibPath.c_str()); 387 inputs.push_back(coreLibRelaxedPath.c_str()); 388 389 uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(), 390 inputs.data(), inputs.size()); 391 392 if (checksum == 0) { 393 return; 394 } 395 396 std::stringstream ss; 397 ss << std::hex << checksum; 398 const char* checksumStr = ss.str().c_str(); 399 400 //===--------------------------------------------------------------------===// 401 // Try to load a shared lib from code cache matching filename and checksum 402 //===--------------------------------------------------------------------===// 403 404 bool alreadyLoaded = false; 405 std::string cloneName; 406 407 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr, 408 &alreadyLoaded); 409 if (mScriptObj != nullptr) { 410 // A shared library named resName is found in code cache directory 411 // cacheDir, and loaded with the handle stored in mScriptObj. 412 413 mExecutable = ScriptExecutable::createFromSharedObject( 414 getCpuRefImpl()->getContext(), mScriptObj, checksum); 415 416 if (mExecutable != nullptr) { 417 // The loaded shared library in mScriptObj has a matching checksum. 418 // An executable object has been created. 419 return; 420 } 421 422 ALOGV("Failed to create an executable object from so file due to " 423 "mismatching checksum"); 424 425 if (alreadyLoaded) { 426 // The shared object found in code cache has already been loaded. 427 // A different file name is needed for the new shared library, to 428 // avoid corrupting the currently loaded instance. 429 430 cloneName.append(resName); 431 cloneName.append("#"); 432 cloneName.append(SharedLibraryUtils::getRandomString(6).string()); 433 434 // The last element in arguments is the output filename. 435 arguments.pop_back(); 436 arguments.push_back(cloneName.c_str()); 437 } 438 439 dlclose(mScriptObj); 440 mScriptObj = nullptr; 441 } 442 443 //===--------------------------------------------------------------------===// 444 // Fuse the input kernels and generate native code in an object file 445 //===--------------------------------------------------------------------===// 446 447 arguments.push_back("-build-checksum"); 448 arguments.push_back(checksumStr); 449 arguments.push_back(nullptr); 450 451 bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH, 452 arguments.size()-1, 453 arguments.data()); 454 if (!compiled) { 455 return; 456 } 457 458 //===--------------------------------------------------------------------===// 459 // Create and load the shared lib 460 //===--------------------------------------------------------------------===// 461 462 if (!SharedLibraryUtils::createSharedLibrary( 463 getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName)) { 464 ALOGE("Failed to link object file '%s'", resName); 465 unlink(objFilePath.c_str()); 466 return; 467 } 468 469 unlink(objFilePath.c_str()); 470 471 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 472 if (mScriptObj == nullptr) { 473 ALOGE("Unable to load '%s'", resName); 474 return; 475 } 476 477 if (alreadyLoaded) { 478 // Delete the temporary, random-named file that we created to avoid 479 // interfering with an already loaded shared library. 480 string cloneFilePath(cacheDir); 481 cloneFilePath.append("/"); 482 cloneFilePath.append(cloneName.c_str()); 483 cloneFilePath.append(".so"); 484 unlink(cloneFilePath.c_str()); 485 } 486 487 mExecutable = ScriptExecutable::createFromSharedObject( 488 getCpuRefImpl()->getContext(), 489 mScriptObj); 490 491#endif // RS_COMPATIBILITY_LIB 492} 493 494void CpuScriptGroup2Impl::execute() { 495 for (auto batch : mBatches) { 496 batch->setGlobalsForBatch(); 497 batch->run(); 498 } 499} 500 501void Batch::setGlobalsForBatch() { 502 for (CPUClosure* cpuClosure : mClosures) { 503 const Closure* closure = cpuClosure->mClosure; 504 const IDBase* funcID = closure->mFunctionID.get(); 505 Script* s = funcID->mScript;; 506 for (const auto& p : closure->mGlobals) { 507 const void* value = p.second.first; 508 int size = p.second.second; 509 if (value == nullptr && size == 0) { 510 // This indicates the current closure depends on another closure for a 511 // global in their shared module (script). In this case we don't need to 512 // copy the value. For example, an invoke intializes a global variable 513 // which a kernel later reads. 514 continue; 515 } 516 rsAssert(p.first != nullptr); 517 Script* script = p.first->mScript; 518 RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl(); 519 const RsdCpuScriptImpl *cpuScript = 520 (const RsdCpuScriptImpl *)ctxt->lookupScript(script); 521 int slot = p.first->mSlot; 522 ScriptExecutable* exec = mGroup->getExecutable(); 523 if (exec != nullptr) { 524 const char* varName = cpuScript->getFieldName(slot); 525 void* addr = exec->getFieldAddress(varName); 526 if (size < 0) { 527 rsrSetObject(mGroup->getCpuRefImpl()->getContext(), 528 (rs_object_base*)addr, (ObjectBase*)value); 529 } else { 530 memcpy(addr, (const void*)&value, size); 531 } 532 } else { 533 // We use -1 size to indicate an ObjectBase rather than a primitive type 534 if (size < 0) { 535 s->setVarObj(slot, (ObjectBase*)value); 536 } else { 537 s->setVar(slot, (const void*)&value, size); 538 } 539 } 540 } 541 } 542} 543 544void Batch::run() { 545 if (!mClosures.front()->mClosure->mIsKernel) { 546 rsAssert(mClosures.size() == 1); 547 548 // This batch contains a single closure for an invoke function 549 CPUClosure* cc = mClosures.front(); 550 const Closure* c = cc->mClosure; 551 552 if (mFunc != nullptr) { 553 // TODO: Need align pointers for x86_64. 554 // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp 555 ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength); 556 } else { 557 const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get(); 558 rsAssert(invokeID != nullptr); 559 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 560 } 561 562 return; 563 } 564 565 if (mFunc != nullptr) { 566 MTLaunchStruct mtls; 567 const CPUClosure* firstCpuClosure = mClosures.front(); 568 const CPUClosure* lastCpuClosure = mClosures.back(); 569 570 firstCpuClosure->mSi->forEachMtlsSetup( 571 (const Allocation**)firstCpuClosure->mClosure->mArgs, 572 firstCpuClosure->mClosure->mNumArg, 573 lastCpuClosure->mClosure->mReturnValue, 574 nullptr, 0, nullptr, &mtls); 575 576 mtls.script = nullptr; 577 mtls.fep.usr = nullptr; 578 mtls.kernel = (ForEachFunc_t)mFunc; 579 580 mGroup->getCpuRefImpl()->launchThreads( 581 (const Allocation**)firstCpuClosure->mClosure->mArgs, 582 firstCpuClosure->mClosure->mNumArg, 583 lastCpuClosure->mClosure->mReturnValue, 584 nullptr, &mtls); 585 586 return; 587 } 588 589 for (CPUClosure* cpuClosure : mClosures) { 590 const Closure* closure = cpuClosure->mClosure; 591 const ScriptKernelID* kernelID = 592 (const ScriptKernelID*)closure->mFunctionID.get(); 593 cpuClosure->mSi->preLaunch(kernelID->mSlot, 594 (const Allocation**)closure->mArgs, 595 closure->mNumArg, closure->mReturnValue, 596 nullptr, 0, nullptr); 597 } 598 599 const CPUClosure* cpuClosure = mClosures.front(); 600 const Closure* closure = cpuClosure->mClosure; 601 MTLaunchStruct mtls; 602 603 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 604 closure->mNumArg, 605 closure->mReturnValue, 606 nullptr, 0, nullptr, &mtls)) { 607 608 mtls.script = nullptr; 609 mtls.kernel = (void (*)())&groupRoot; 610 mtls.fep.usr = &mClosures; 611 612 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 613 } 614 615 for (CPUClosure* cpuClosure : mClosures) { 616 const Closure* closure = cpuClosure->mClosure; 617 const ScriptKernelID* kernelID = 618 (const ScriptKernelID*)closure->mFunctionID.get(); 619 cpuClosure->mSi->postLaunch(kernelID->mSlot, 620 (const Allocation**)closure->mArgs, 621 closure->mNumArg, closure->mReturnValue, 622 nullptr, 0, nullptr); 623 } 624} 625 626} // namespace renderscript 627} // namespace android 628