rsCpuScriptGroup2.cpp revision f02a2b0a2749d4a4f07edbc23eddff2e51d11b72
#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include <zlib.h>

#include "bcc/Config/Config.h"
#endif

#include "cpu_ref/rsCpuCore.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuExecutable.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"

using std::string;
using std::vector;

namespace android {
namespace renderscript {

namespace {

// NOTE(review): appears unused in this file — confirm before removing.
const size_t DefaultKernelArgCount = 2;

// Expanded-kernel "root" used when a batch of kernels could NOT be fused into a
// single compiled function (see Batch::run, which sets mtls.fep.usr to the
// batch's closure list). For each x-range chunk handed out by launchThreads,
// this walks the batch's closures in order, repoints kinfo's input/output
// pointers and strides at each closure's own Allocations, and invokes the
// closure's expanded kernel. kinfo is shared/mutated in place, so the original
// inLen/inStride are saved on entry and restored on exit.
//
// `outstep` is unused here; each closure is launched with its own output
// element stride (`ostep`) instead.
void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
               uint32_t xend, uint32_t outstep) {
    const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kinfo->usr;
    // kinfo is logically const to the caller, but we temporarily rewrite its
    // per-launch input/output fields for each closure, restoring them below.
    RsExpandKernelDriverInfo *mutable_kinfo = const_cast<RsExpandKernelDriverInfo *>(kinfo);

    const size_t oldInLen = mutable_kinfo->inLen;

    // inStride is an array field; snapshot it wholesale so it can be restored.
    decltype(mutable_kinfo->inStride) oldInStride;
    memcpy(&oldInStride, &mutable_kinfo->inStride, sizeof(oldInStride));

    for (CPUClosure* cpuClosure : closures) {
        const Closure* closure = cpuClosure->mClosure;

        // There had better be enough space in mutable_kinfo
        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);

        // Point each kernel input at this closure's argument Allocations,
        // offset to the start of the current x-range (and row, if 2D).
        for (size_t i = 0; i < closure->mNumArg; i++) {
            const void* arg = closure->mArgs[i];
            const Allocation* a = (const Allocation*)arg;
            const uint32_t eStride = a->mHal.state.elementSizeBytes;
            const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) +
                    eStride * xstart;
            if (kinfo->dim.y > 1) {
                ptr += a->mHal.drvState.lod[0].stride * kinfo->current.y;
            }
            mutable_kinfo->inPtr[i] = ptr;
            mutable_kinfo->inStride[i] = eStride;
        }
        mutable_kinfo->inLen = closure->mNumArg;

        // Same addressing for the single output allocation.
        const Allocation* out = closure->mReturnValue;
        const uint32_t ostep = out->mHal.state.elementSizeBytes;
        const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) +
                ostep * xstart;
        if (kinfo->dim.y > 1) {
            ptr += out->mHal.drvState.lod[0].stride * kinfo->current.y;
        }

        rsAssert(kinfo->outLen <= 1);
        mutable_kinfo->outPtr[0] = const_cast<uint8_t*>(ptr);

        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
    }

    // Restore the fields we clobbered so the caller sees kinfo unchanged.
    mutable_kinfo->inLen = oldInLen;
    memcpy(&mutable_kinfo->inStride, &oldInStride, sizeof(oldInStride));
}

} // namespace

// A Batch owns its name string (freed in the destructor) and its CPUClosures.
// NOTE(review): strndup(name, strlen(name)) is equivalent to strdup(name).
Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
    mGroup(group), mFunc(nullptr) {
    mName = strndup(name, strlen(name));
}

Batch::~Batch() {
    for (CPUClosure* c : mClosures) {
        delete c;
    }
    free(mName);
}

// Returns true if cpuClosure cannot be fused into this batch: either side is
// an invoke (non-kernel), or cpuClosure depends on a closure already batched
// (via a global, or via an argument fed by a batched closure's result/global).
bool Batch::conflict(CPUClosure* cpuClosure) const {
    if (mClosures.empty()) {
        return false;
    }

    const Closure* closure = cpuClosure->mClosure;

    if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) {
        // An invoke should be in a batch by itself, so it conflicts with any other
        // closure.
        return true;
    }

    const auto& globalDeps = closure->mGlobalDeps;
    const auto& argDeps = closure->mArgDeps;

    for (CPUClosure* c : mClosures) {
        const Closure* batched = c->mClosure;
        if (globalDeps.find(batched) != globalDeps.end()) {
            return true;
        }
        const auto& it = argDeps.find(batched);
        if (it != argDeps.end()) {
            const auto& args = (*it).second;
            // Any non-null entry means some argument of `closure` is produced
            // by `batched`, so they cannot run in the same fused launch.
            for (const auto &p1 : *args) {
                if (p1.second.get() != nullptr) {
                    return true;
                }
            }
        }
    }

    return false;
}

// Greedily packs the group's closures into batches ("Batch0", "Batch1", ...),
// starting a new batch whenever the next closure conflicts with the current
// one. Then (non-compat builds) tries to fuse/compile the batches into a
// shared object and resolve each batch's entry point from it.
CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                         const ScriptGroupBase *sg) :
    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
    mExecutable(nullptr), mScriptObj(nullptr) {
    rsAssert(!mGroup->mClosures.empty());

    Batch* batch = new Batch(this, "Batch0");
    int i = 0;
    for (Closure* closure: mGroup->mClosures) {
        CPUClosure* cc;
        const IDBase* funcID = closure->mFunctionID.get();
        RsdCpuScriptImpl* si =
                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);
        if (closure->mIsKernel) {
            // Kernels carry their expanded-kernel function pointer for the
            // non-fused (groupRoot) execution path.
            MTLaunchStruct mtls;
            si->forEachKernelSetup(funcID->mSlot, &mtls);
            cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel);
        } else {
            cc = new CPUClosure(closure, si);
        }

        if (batch->conflict(cc)) {
            mBatches.push_back(batch);
            std::stringstream ss;
            ss << "Batch" << ++i;
            batch = new Batch(this, ss.str().c_str());
        }

        batch->mClosures.push_back(cc);
    }

    rsAssert(!batch->mClosures.empty());
    mBatches.push_back(batch);

#ifndef RS_COMPATIBILITY_LIB
    compile(mGroup->mCacheDir);
    // Only resolve fused entry points if compile() produced (or loaded) both
    // the shared object and its ScriptExecutable.
    if (mScriptObj != nullptr && mExecutable != nullptr) {
        for (Batch* batch : mBatches) {
            batch->resolveFuncPtr(mScriptObj);
        }
    }
#endif // RS_COMPATIBILITY_LIB
}

// Looks up this batch's entry point in the compiled shared object. Fused
// kernel batches are emitted as "<BatchName>.expand"; invoke batches use the
// batch name directly.
void Batch::resolveFuncPtr(void* sharedObj) {
    std::string funcName(mName);
    if (mClosures.front()->mClosure->mIsKernel) {
        funcName.append(".expand");
    }
    mFunc = dlsym(sharedObj, funcName.c_str());
    rsAssert (mFunc != nullptr);
}

CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
    for (Batch* batch : mBatches) {
        delete batch;
    }
    delete mExecutable;
    // TODO: move this dlclose into ~ScriptExecutable().
    if (mScriptObj != nullptr) {
        dlclose(mScriptObj);
    }
}

namespace {

#ifndef RS_COMPATIBILITY_LIB

// Chooses the core bitcode library for the current context/architecture.
// Also outputs (via coreLibRelaxedPath) the relaxed-precision library to use,
// or "" if none applies. Note the debug path returns before any relaxed-path
// selection, and on ARM-with-NEON the relaxed path is set before the
// architecture-specific return below.
string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
    *coreLibRelaxedPath = "";

    // If we're debugging, use the debug library.
    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
        return SYSLIBPATH"/libclcore_debug.bc";
    }

    // Check for a platform specific library

#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    // NEON-capable ARMv7a devices can use an accelerated math library
    // for all reduced precision scripts.
    // ARMv8 does not use NEON, as ASIMD can be used with all precision
    // levels.
    *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc";
#endif

#if defined(__i386__) || defined(__x86_64__)
    // x86 devices will use an optimized library.
    return SYSLIBPATH"/libclcore_x86.bc";
#else
    return SYSLIBPATH"/libclcore.bc";
#endif
}

// Computes an adler32 checksum over all input bitcode files, the core
// libraries, and the bcc command line, and formats it as 8 lowercase hex
// digits into checksumStr (which must therefore hold at least 9 bytes).
// Returns false if any file could not be read into the checksum.
bool getChecksum(const std::vector<string>& inputBitcodeFilenames,
                 const string& coreLibPath, const string& coreLibRelaxedPath,
                 const char* commandLine,
                 char* checksumStr) {
    uint32_t checksum = adler32(0L, Z_NULL, 0);

    for (const auto& bcFilename : inputBitcodeFilenames) {
        if (!android::renderscript::addFileToChecksum(bcFilename.c_str(), checksum)) {
            return false;
        }
    }

    if (!android::renderscript::addFileToChecksum(coreLibPath.c_str(), checksum)) {
        return false;
    }

    if (!coreLibRelaxedPath.empty() &&
        !android::renderscript::addFileToChecksum(coreLibRelaxedPath.c_str(), checksum)) {
        return false;
    }

    // include checksum of command line arguments
    checksum = adler32(checksum, (const unsigned char *) commandLine,
                       strlen(commandLine));

    sprintf(checksumStr, "%08x", checksum);

    return true;
}

// Builds the bcc argv for fusing the group: one "-merge" spec per kernel
// batch, one "-invoke" spec per invoke batch. The pushed pointers alias the
// caller's strings, so all string arguments must outlive *args.
void setupCompileArguments(
        const vector<string>& inputs, const vector<string>& kernelBatches,
        const vector<string>& invokeBatches,
        const string& output_dir, const string& output_filename,
        const string& coreLibPath, const string& coreLibRelaxedPath,
        vector<const char*>* args) {
    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
    args->push_back("-fPIC");
    args->push_back("-embedRSInfo");
    args->push_back("-mtriple");
    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
    args->push_back("-bclib");
    args->push_back(coreLibPath.c_str());
    args->push_back("-bclib_relaxed");
    args->push_back(coreLibRelaxedPath.c_str());
    for (const string& input : inputs) {
        args->push_back(input.c_str());
    }
    for (const string& batch : kernelBatches) {
        args->push_back("-merge");
        args->push_back(batch.c_str());
    }
    for (const string& batch : invokeBatches) {
        args->push_back("-invoke");
        args->push_back(batch.c_str());
    }
    args->push_back("-output_path");
    args->push_back(output_dir.c_str());
    args->push_back("-o");
    args->push_back(output_filename.c_str());
}

// Appends "<inputIndex>,<slot>." to ss, identifying the closure's kernel or
// invocable by the index of its bitcode file within `inputs` and its slot
// number. Intrinsics are not supported (asserted).
void generateSourceSlot(const Closure& closure,
                        const std::vector<std::string>& inputs,
                        std::stringstream& ss) {
    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
    const Script* script = funcID->mScript;

    rsAssert (!script->isIntrinsic());

    const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl*)script->mHal.drv;
    const string& bitcodeFilename = cpuScript->getBitcodeFilePath();

    const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) -
            inputs.begin();

    ss << index << "," << funcID->mSlot << ".";
}

#endif // RS_COMPATIBILITY_LIB

} // anonymous namespace

// Fuses the group's batches into one shared object via bcc, with a
// checksum-validated on-disk cache:
//   1. bail out for trivial groups (<2 closures) or if any script is an
//      intrinsic (can't be fused);
//   2. describe each batch as "-merge"/"-invoke" specs over the set of input
//      bitcode files;
//   3. if a cached .so with a matching build checksum exists, load it and
//      return;
//   4. otherwise run bcc, link the object into a .so, load it, and create its
//      ScriptExecutable.
// On any failure this returns leaving mScriptObj/mExecutable null, in which
// case execution falls back to the unfused per-closure path (see Batch::run).
void CpuScriptGroup2Impl::compile(const char* cacheDir) {
#ifndef RS_COMPATIBILITY_LIB
    if (mGroup->mClosures.size() < 2) {
        return;
    }

    // Collect the distinct bitcode files backing the group's scripts.
    std::set<string> inputSet;
    for (Closure* closure : mGroup->mClosures) {
        const Script* script = closure->mFunctionID.get()->mScript;

        // If any script is an intrinsic, give up trying fusing the kernels.
        if (script->isIntrinsic()) {
            return;
        }

        const RsdCpuScriptImpl *cpuScript =
                (const RsdCpuScriptImpl*)script->mHal.drv;
        const string& bitcodeFilename = cpuScript->getBitcodeFilePath();
        inputSet.insert(bitcodeFilename);
    }

    std::vector<string> inputs(inputSet.begin(), inputSet.end());

    std::vector<string> kernelBatches;
    std::vector<string> invokeBatches;

    // Encode each batch as "<BatchName>:<idx>,<slot>.<idx>,<slot>..."
    int i = 0;
    for (const auto& batch : mBatches) {
        rsAssert(batch->size() > 0);

        std::stringstream ss;
        ss << batch->mName << ":";

        if (!batch->mClosures.front()->mClosure->mIsKernel) {
            // Invoke batches contain exactly one closure (see Batch::conflict).
            rsAssert(batch->size() == 1);
            generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss);
            invokeBatches.push_back(ss.str());
        } else {
            for (const auto& cpuClosure : batch->mClosures) {
                generateSourceSlot(*cpuClosure->mClosure, inputs, ss);
            }
            kernelBatches.push_back(ss.str());
        }
    }

    rsAssert(cacheDir != nullptr);
    string objFilePath(cacheDir);
    objFilePath.append("/");
    objFilePath.append(mGroup->mName);
    objFilePath.append(".o");

    string outputFileName(mGroup->mName);
    string coreLibRelaxedPath;
    const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(),
                                               &coreLibRelaxedPath);

    vector<const char*> arguments;
    string output_dir(cacheDir);
    setupCompileArguments(inputs, kernelBatches, invokeBatches, output_dir,
                          outputFileName, coreLibPath, coreLibRelaxedPath,
                          &arguments);

    // NOTE(review): unique_ptr<const char> releases with scalar delete;
    // confirm rsuJoinStrings allocates compatibly (unique_ptr<const char[]>
    // or free() would be expected for a C string) — TODO verify.
    std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1,
                                                       arguments.data()));

    if (!getChecksum(inputs, coreLibPath, coreLibRelaxedPath, cmdLine.get(),
                     mChecksum)) {
        return;
    }

    const char* resName = outputFileName.c_str();

    //===--------------------------------------------------------------------===//
    // Try to load a shared lib from code cache matching filename and checksum
    //===--------------------------------------------------------------------===//

    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    if (mScriptObj != nullptr) {
        mExecutable = ScriptExecutable::createFromSharedObject(
                getCpuRefImpl()->getContext(), mScriptObj);
        if (mExecutable != nullptr) {
            if (mExecutable->isChecksumValid(mChecksum)) {
                // Cache hit: reuse the previously compiled shared object.
                return;
            } else {
                ALOGE("Invalid checksum from cached so: %s (expected: %s)",
                      mExecutable->getBuildChecksum(), mChecksum);
            }
            // Stale cache entry: discard and fall through to recompile.
            delete mExecutable;
            mExecutable = nullptr;
        } else {
            ALOGE("Failed to create an executable object from so file");
        }
        dlclose(mScriptObj);
        mScriptObj = nullptr;
    }

    //===--------------------------------------------------------------------===//
    // Fuse the input kernels and generate native code in an object file
    //===--------------------------------------------------------------------===//

    arguments.push_back("-build-checksum");
    arguments.push_back(mChecksum);
    arguments.push_back(nullptr);

    bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH,
                                      arguments.size()-1,
                                      arguments.data());
    if (!compiled) {
        return;
    }

    //===--------------------------------------------------------------------===//
    // Create and load the shared lib
    //===--------------------------------------------------------------------===//

    if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) {
        ALOGE("Failed to link object file '%s'", resName);
        unlink(objFilePath.c_str());
        return;
    }

    // The intermediate object file is no longer needed once linked.
    unlink(objFilePath.c_str());

    mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName);
    if (mScriptObj == nullptr) {
        ALOGE("Unable to load '%s'", resName);
        return;
    }

    mExecutable = ScriptExecutable::createFromSharedObject(
            getCpuRefImpl()->getContext(),
            mScriptObj);

#endif // RS_COMPATIBILITY_LIB
}

// Executes the group: for each batch, first propagate bound global values,
// then run the batch (fused if compiled, otherwise the fallback path).
void CpuScriptGroup2Impl::execute() {
    for (auto batch : mBatches) {
        batch->setGlobalsForBatch();
        batch->run();
    }
}

// Writes each closure's bound global values into the target scripts before
// the batch runs. Two destinations: the fused executable's field addresses
// (when compilation succeeded), or the script's own setVar/setVarObj slots.
void Batch::setGlobalsForBatch() {
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const IDBase* funcID = closure->mFunctionID.get();
        Script* s = funcID->mScript;;
        for (const auto& p : closure->mGlobals) {
            const void* value = p.second.first;
            int size = p.second.second;
            if (value == nullptr && size == 0) {
                // This indicates the current closure depends on another closure for a
                // global in their shared module (script). In this case we don't need to
                // copy the value. For example, an invoke intializes a global variable
                // which a kernel later reads.
                continue;
            }
            rsAssert(p.first != nullptr);
            Script* script = p.first->mScript;
            const RsdCpuScriptImpl *cpuScript =
                    (const RsdCpuScriptImpl*)script->mHal.drv;
            int slot = p.first->mSlot;
            ScriptExecutable* exec = mGroup->getExecutable();
            if (exec != nullptr) {
                // Fused path: poke the value directly into the shared
                // object's global, located by field name.
                const char* varName = cpuScript->getFieldName(slot);
                void* addr = exec->getFieldAddress(varName);
                if (size < 0) {
                    // Negative size marks an ObjectBase (see comment below).
                    rsrSetObject(mGroup->getCpuRefImpl()->getContext(),
                                 (rs_object_base*)addr, (ObjectBase*)value);
                } else {
                    // Primitive values are packed into the pointer-sized
                    // `value` slot itself — note the copy is from &value,
                    // taking `size` bytes of its storage.
                    memcpy(addr, (const void*)&value, size);
                }
            } else {
                // We use -1 size to indicate an ObjectBase rather than a primitive type
                if (size < 0) {
                    s->setVarObj(slot, (ObjectBase*)value);
                } else {
                    // Same pointer-packing convention as above: pass the
                    // address of the pointer-sized slot holding the value.
                    s->setVar(slot, (const void*)&value, size);
                }
            }
        }
    }
}

// Runs this batch via one of three paths:
//   1. a single invoke closure — call the fused function if available,
//      otherwise invoke through the script;
//   2. a fused kernel batch (mFunc resolved) — one launchThreads over the
//      first closure's inputs and the last closure's output;
//   3. fallback — pre/postLaunch each closure and launch groupRoot, which
//      iterates the closures per x-range chunk (kinfo->usr = &mClosures).
void Batch::run() {
    if (!mClosures.front()->mClosure->mIsKernel) {
        rsAssert(mClosures.size() == 1);

        // This batch contains a single closure for an invoke function
        CPUClosure* cc = mClosures.front();
        const Closure* c = cc->mClosure;

        if (mFunc != nullptr) {
            // TODO: Need align pointers for x86_64.
            // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp
            ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength);
        } else {
            const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get();
            rsAssert(invokeID != nullptr);
            cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength);
        }

        return;
    }

    if (mFunc != nullptr) {
        // Fused kernel batch: inputs come from the first closure, the final
        // output from the last; intermediates live inside the fused kernel.
        MTLaunchStruct mtls;
        const CPUClosure* firstCpuClosure = mClosures.front();
        const CPUClosure* lastCpuClosure = mClosures.back();

        firstCpuClosure->mSi->forEachMtlsSetup(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, 0, nullptr, &mtls);

        mtls.script = nullptr;
        mtls.fep.usr = nullptr;
        mtls.kernel = (ForEachFunc_t)mFunc;

        mGroup->getCpuRefImpl()->launchThreads(
                (const Allocation**)firstCpuClosure->mClosure->mArgs,
                firstCpuClosure->mClosure->mNumArg,
                lastCpuClosure->mClosure->mReturnValue,
                nullptr, &mtls);

        return;
    }

    // Fallback path: no fused function, so run the closures back-to-back
    // inside a single launch via groupRoot. preLaunch every closure first...
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->preLaunch(kernelID->mSlot,
                                   (const Allocation**)closure->mArgs,
                                   closure->mNumArg, closure->mReturnValue,
                                   nullptr, 0, nullptr);
    }

    // ...then set up launch bounds from the first closure and hand groupRoot
    // the whole closure list through mtls.fep.usr.
    const CPUClosure* cpuClosure = mClosures.front();
    const Closure* closure = cpuClosure->mClosure;
    MTLaunchStruct mtls;

    if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs,
                                          closure->mNumArg,
                                          closure->mReturnValue,
                                          nullptr, 0, nullptr, &mtls)) {

        mtls.script = nullptr;
        mtls.kernel = (void (*)())&groupRoot;
        mtls.fep.usr = &mClosures;

        mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls);
    }

    // Finally postLaunch every closure, mirroring the preLaunch loop above.
    for (CPUClosure* cpuClosure : mClosures) {
        const Closure* closure = cpuClosure->mClosure;
        const ScriptKernelID* kernelID =
                (const ScriptKernelID*)closure->mFunctionID.get();
        cpuClosure->mSi->postLaunch(kernelID->mSlot,
                                    (const Allocation**)closure->mArgs,
                                    closure->mNumArg, closure->mReturnValue,
                                    nullptr, 0, nullptr);
    }
}

} // namespace renderscript
} // namespace android