// rsCpuScriptGroup2.cpp revision 958d8b23ac969d13ea3da0a2d9a355f5951afa8c
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4#include <stdio.h> 5#include <stdlib.h> 6#include <unistd.h> 7 8#include <set> 9#include <sstream> 10#include <string> 11#include <vector> 12 13#ifndef RS_COMPATIBILITY_LIB 14#include "bcc/Config/Config.h" 15#include <sys/wait.h> 16#endif 17 18#include "cpu_ref/rsCpuCore.h" 19#include "cpu_ref/rsCpuCoreRuntime.h" 20#include "rsClosure.h" 21#include "rsContext.h" 22#include "rsCpuCore.h" 23#include "rsCpuExecutable.h" 24#include "rsCpuScript.h" 25#include "rsScript.h" 26#include "rsScriptGroup2.h" 27#include "rsScriptIntrinsic.h" 28 29using std::string; 30using std::vector; 31 32namespace android { 33namespace renderscript { 34 35namespace { 36 37const size_t DefaultKernelArgCount = 2; 38 39void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, 40 uint32_t xend, uint32_t outstep) { 41 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr; 42 RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams; 43 const void **oldIns = kparams->ins; 44 uint32_t *oldStrides = kparams->inEStrides; 45 46 std::vector<const void*> ins(DefaultKernelArgCount); 47 std::vector<uint32_t> strides(DefaultKernelArgCount); 48 49 for (CPUClosure* cpuClosure : closures) { 50 const Closure* closure = cpuClosure->mClosure; 51 52 auto in_iter = ins.begin(); 53 auto stride_iter = strides.begin(); 54 55 for (size_t i = 0; i < closure->mNumArg; i++) { 56 const void* arg = closure->mArgs[i]; 57 const Allocation* a = (const Allocation*)arg; 58 const uint32_t eStride = a->mHal.state.elementSizeBytes; 59 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 60 eStride * xstart; 61 if (kparams->dimY > 1) { 62 ptr += a->mHal.drvState.lod[0].stride * kparams->y; 63 } 64 *in_iter++ = ptr; 65 *stride_iter++ = eStride; 66 } 67 68 mutable_kparams->ins = &ins[0]; 69 mutable_kparams->inEStrides = &strides[0]; 70 71 const Allocation* out = closure->mReturnValue; 72 const uint32_t ostep = 
out->mHal.state.elementSizeBytes; 73 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 74 ostep * xstart; 75 if (kparams->dimY > 1) { 76 ptr += out->mHal.drvState.lod[0].stride * kparams->y; 77 } 78 79 mutable_kparams->out = (void*)ptr; 80 81 cpuClosure->mFunc(kparams, xstart, xend, ostep); 82 } 83 84 mutable_kparams->ins = oldIns; 85 mutable_kparams->inEStrides = oldStrides; 86} 87 88} // namespace 89 90Batch::Batch(CpuScriptGroup2Impl* group, const char* name) : 91 mGroup(group), mFunc(nullptr) { 92 mName = strndup(name, strlen(name)); 93} 94 95Batch::~Batch() { 96 for (CPUClosure* c : mClosures) { 97 delete c; 98 } 99 free(mName); 100} 101 102bool Batch::conflict(CPUClosure* cpuClosure) const { 103 if (mClosures.empty()) { 104 return false; 105 } 106 107 const Closure* closure = cpuClosure->mClosure; 108 109 if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) { 110 // An invoke should be in a batch by itself, so it conflicts with any other 111 // closure. 
112 return true; 113 } 114 115 const auto& globalDeps = closure->mGlobalDeps; 116 const auto& argDeps = closure->mArgDeps; 117 118 for (CPUClosure* c : mClosures) { 119 const Closure* batched = c->mClosure; 120 if (globalDeps.find(batched) != globalDeps.end()) { 121 return true; 122 } 123 const auto& it = argDeps.find(batched); 124 if (it != argDeps.end()) { 125 const auto& args = (*it).second; 126 for (const auto &p1 : *args) { 127 if (p1.second->get() != nullptr) { 128 return true; 129 } 130 } 131 } 132 } 133 134 return false; 135} 136 137CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 138 const ScriptGroupBase *sg) : 139 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)), 140 mExecutable(nullptr), mScriptObj(nullptr) { 141 rsAssert(!mGroup->mClosures.empty()); 142 143 Batch* batch = new Batch(this, "Batch0"); 144 int i = 0; 145 for (Closure* closure: mGroup->mClosures) { 146 CPUClosure* cc; 147 const IDBase* funcID = closure->mFunctionID.get(); 148 RsdCpuScriptImpl* si = 149 (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript); 150 if (closure->mIsKernel) { 151 MTLaunchStruct mtls; 152 si->forEachKernelSetup(funcID->mSlot, &mtls); 153 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel); 154 } else { 155 cc = new CPUClosure(closure, si); 156 } 157 158 if (batch->conflict(cc)) { 159 mBatches.push_back(batch); 160 std::stringstream ss; 161 ss << "Batch" << ++i; 162 batch = new Batch(this, ss.str().c_str()); 163 } 164 165 batch->mClosures.push_back(cc); 166 } 167 168 rsAssert(!batch->mClosures.empty()); 169 mBatches.push_back(batch); 170 171#ifndef RS_COMPATIBILITY_LIB 172 compile(mGroup->mCacheDir); 173 if (mScriptObj != nullptr && mExecutable != nullptr) { 174 for (Batch* batch : mBatches) { 175 batch->resolveFuncPtr(mScriptObj); 176 } 177 } 178#endif // RS_COMPATIBILITY_LIB 179} 180 181void Batch::resolveFuncPtr(void* sharedObj) { 182 std::string funcName(mName); 183 if (mClosures.front()->mClosure->mIsKernel) { 
184 funcName.append(".expand"); 185 } 186 mFunc = dlsym(sharedObj, funcName.c_str()); 187 rsAssert (mFunc != nullptr); 188} 189 190CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 191 for (Batch* batch : mBatches) { 192 delete batch; 193 } 194 // TODO: move this dlclose into ~ScriptExecutable(). 195 if (mScriptObj != nullptr) { 196 dlclose(mScriptObj); 197 } 198 delete mExecutable; 199} 200 201namespace { 202 203#ifndef RS_COMPATIBILITY_LIB 204 205string getCoreLibPath(Context* context, string* coreLibRelaxedPath) { 206 *coreLibRelaxedPath = ""; 207 208 // If we're debugging, use the debug library. 209 if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) { 210 return SYSLIBPATH"/libclcore_debug.bc"; 211 } 212 213 // Check for a platform specific library 214 215#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON) 216 // NEON-capable ARMv7a devices can use an accelerated math library 217 // for all reduced precision scripts. 218 // ARMv8 does not use NEON, as ASIMD can be used with all precision 219 // levels. 220 *coreLibRelaxedPath = SYSLIBPATH"/libclcore_neon.bc"; 221#endif 222 223#if defined(__i386__) || defined(__x86_64__) 224 // x86 devices will use an optimized library. 
225 return SYSLIBPATH"/libclcore_x86.bc"; 226#else 227 return SYSLIBPATH"/libclcore.bc"; 228#endif 229} 230 231string getFileName(string path) { 232 unsigned found = path.find_last_of("/\\"); 233 return path.substr(found + 1); 234} 235 236void setupCompileArguments( 237 const vector<string>& inputs, const vector<string>& kernelBatches, 238 const vector<string>& invokeBatches, 239 const string& output_dir, const string& output_filename, 240 const string& coreLibPath, const string& coreLibRelaxedPath, 241 vector<const char*>* args) { 242 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 243 args->push_back("-fPIC"); 244 args->push_back("-embedRSInfo"); 245 args->push_back("-mtriple"); 246 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 247 args->push_back("-bclib"); 248 args->push_back(coreLibPath.c_str()); 249 args->push_back("-bclib_relaxed"); 250 args->push_back(coreLibRelaxedPath.c_str()); 251 for (const string& input : inputs) { 252 args->push_back(input.c_str()); 253 } 254 for (const string& batch : kernelBatches) { 255 args->push_back("-merge"); 256 args->push_back(batch.c_str()); 257 } 258 for (const string& batch : invokeBatches) { 259 args->push_back("-invoke"); 260 args->push_back(batch.c_str()); 261 } 262 args->push_back("-output_path"); 263 args->push_back(output_dir.c_str()); 264 args->push_back("-o"); 265 args->push_back(output_filename.c_str()); 266 args->push_back(nullptr); 267} 268 269bool fuseAndCompile(const char** arguments, 270 const string& commandLine) { 271 const pid_t pid = fork(); 272 273 if (pid == -1) { 274 ALOGE("Couldn't fork for bcc execution"); 275 return false; 276 } 277 278 if (pid == 0) { 279 // Child process 280 ALOGV("Invoking BCC with: %s", commandLine.c_str()); 281 execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); 282 283 ALOGE("execv() failed: %s", strerror(errno)); 284 abort(); 285 return false; 286 } 287 288 // Parent process 289 int status = 0; 290 const pid_t w = waitpid(pid, &status, 0); 291 if (w == -1) { 292 
return false; 293 } 294 295 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { 296 ALOGE("bcc terminated unexpectedly"); 297 return false; 298 } 299 300 return true; 301} 302 303void generateSourceSlot(const Closure& closure, 304 const std::vector<std::string>& inputs, 305 std::stringstream& ss) { 306 const IDBase* funcID = (const IDBase*)closure.mFunctionID.get(); 307 const Script* script = funcID->mScript; 308 309 rsAssert (!script->isIntrinsic()); 310 311 const RsdCpuScriptImpl *cpuScript = 312 (const RsdCpuScriptImpl*)script->mHal.drv; 313 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 314 315 const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) - 316 inputs.begin(); 317 318 ss << index << "," << funcID->mSlot << "."; 319} 320 321#endif // RS_COMPATIBILTY_LIB 322 323} // anonymous namespace 324 325void CpuScriptGroup2Impl::compile(const char* cacheDir) { 326#ifndef RS_COMPATIBILITY_LIB 327 if (mGroup->mClosures.size() < 2) { 328 return; 329 } 330 331 //===--------------------------------------------------------------------===// 332 // Fuse the input kernels and generate native code in an object file 333 //===--------------------------------------------------------------------===// 334 335 std::set<string> inputSet; 336 for (Closure* closure : mGroup->mClosures) { 337 const Script* script = closure->mFunctionID.get()->mScript; 338 339 // If any script is an intrinsic, give up trying fusing the kernels. 
340 if (script->isIntrinsic()) { 341 return; 342 } 343 344 const RsdCpuScriptImpl *cpuScript = 345 (const RsdCpuScriptImpl*)script->mHal.drv; 346 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 347 inputSet.insert(bitcodeFilename); 348 } 349 350 std::vector<string> inputs(inputSet.begin(), inputSet.end()); 351 352 std::vector<string> kernelBatches; 353 std::vector<string> invokeBatches; 354 355 int i = 0; 356 for (const auto& batch : mBatches) { 357 rsAssert(batch->size() > 0); 358 359 std::stringstream ss; 360 ss << batch->mName << ":"; 361 362 if (!batch->mClosures.front()->mClosure->mIsKernel) { 363 rsAssert(batch->size() == 1); 364 generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss); 365 invokeBatches.push_back(ss.str()); 366 } else { 367 for (const auto& cpuClosure : batch->mClosures) { 368 generateSourceSlot(*cpuClosure->mClosure, inputs, ss); 369 } 370 kernelBatches.push_back(ss.str()); 371 } 372 } 373 374 rsAssert(cacheDir != nullptr); 375 string objFilePath(cacheDir); 376 objFilePath.append("/fusedXXXXXX.o"); 377 // Find unique object file name, to make following file names unique. 
378 int tempfd = mkstemps(&objFilePath[0], 2); 379 if (tempfd == -1) { 380 return; 381 } 382 TEMP_FAILURE_RETRY(close(tempfd)); 383 384 string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2)); 385 string coreLibRelaxedPath; 386 const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(), 387 &coreLibRelaxedPath); 388 vector<const char*> arguments; 389 setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir, 390 outputFileName, coreLibPath, coreLibRelaxedPath, &arguments); 391 std::unique_ptr<const char> joined( 392 rsuJoinStrings(arguments.size() - 1, arguments.data())); 393 string commandLine (joined.get()); 394 395 if (!fuseAndCompile(arguments.data(), commandLine)) { 396 unlink(objFilePath.c_str()); 397 return; 398 } 399 400 //===--------------------------------------------------------------------===// 401 // Create and load the shared lib 402 //===--------------------------------------------------------------------===// 403 404 const char* resName = outputFileName.c_str(); 405 406 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 407 ALOGE("Failed to link object file '%s'", resName); 408 return; 409 } 410 411 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 412 if (mScriptObj == nullptr) { 413 ALOGE("Unable to load '%s'", resName); 414 return; 415 } 416 417 mExecutable = ScriptExecutable::createFromSharedObject( 418 nullptr, // RS context. Unused. 
419 mScriptObj); 420 421#endif // RS_COMPATIBILITY_LIB 422} 423 424void CpuScriptGroup2Impl::execute() { 425 for (auto batch : mBatches) { 426 batch->setGlobalsForBatch(); 427 batch->run(); 428 } 429} 430 431void Batch::setGlobalsForBatch() { 432 for (CPUClosure* cpuClosure : mClosures) { 433 const Closure* closure = cpuClosure->mClosure; 434 const IDBase* funcID = closure->mFunctionID.get(); 435 Script* s = funcID->mScript;; 436 for (const auto& p : closure->mGlobals) { 437 const void* value = p.second.first; 438 int size = p.second.second; 439 if (value == nullptr && size == 0) { 440 // This indicates the current closure depends on another closure for a 441 // global in their shared module (script). In this case we don't need to 442 // copy the value. For example, an invoke intializes a global variable 443 // which a kernel later reads. 444 continue; 445 } 446 rsAssert(p.first != nullptr); 447 ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)", 448 closure, p.first, p.first->mScript, p.first->mSlot); 449 Script* script = p.first->mScript; 450 const RsdCpuScriptImpl *cpuScript = 451 (const RsdCpuScriptImpl*)script->mHal.drv; 452 int slot = p.first->mSlot; 453 ScriptExecutable* exec = mGroup->getExecutable(); 454 if (exec != nullptr) { 455 const char* varName = cpuScript->getFieldName(slot); 456 void* addr = exec->getFieldAddress(varName); 457 if (size < 0) { 458 rsrSetObject(mGroup->getCpuRefImpl()->getContext(), 459 (rs_object_base*)addr, (ObjectBase*)value); 460 } else { 461 memcpy(addr, (const void*)&value, size); 462 } 463 } else { 464 // We use -1 size to indicate an ObjectBase rather than a primitive type 465 if (size < 0) { 466 s->setVarObj(slot, (ObjectBase*)value); 467 } else { 468 s->setVar(slot, (const void*)&value, size); 469 } 470 } 471 } 472 } 473} 474 475void Batch::run() { 476 if (!mClosures.front()->mClosure->mIsKernel) { 477 rsAssert(mClosures.size() == 1); 478 479 // This batch contains a single closure for an invoke function 
480 CPUClosure* cc = mClosures.front(); 481 const Closure* c = cc->mClosure; 482 483 if (mFunc != nullptr) { 484 // TODO: Need align pointers for x86_64. 485 // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp 486 ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength); 487 } else { 488 const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get(); 489 rsAssert(invokeID != nullptr); 490 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 491 } 492 493 return; 494 } 495 496 if (mFunc != nullptr) { 497 MTLaunchStruct mtls; 498 const CPUClosure* firstCpuClosure = mClosures.front(); 499 const CPUClosure* lastCpuClosure = mClosures.back(); 500 501 firstCpuClosure->mSi->forEachMtlsSetup( 502 (const Allocation**)firstCpuClosure->mClosure->mArgs, 503 firstCpuClosure->mClosure->mNumArg, 504 lastCpuClosure->mClosure->mReturnValue, 505 nullptr, 0, nullptr, &mtls); 506 507 mtls.script = nullptr; 508 mtls.fep.usr = nullptr; 509 mtls.kernel = (ForEachFunc_t)mFunc; 510 511 mGroup->getCpuRefImpl()->launchThreads( 512 (const Allocation**)firstCpuClosure->mClosure->mArgs, 513 firstCpuClosure->mClosure->mNumArg, 514 lastCpuClosure->mClosure->mReturnValue, 515 nullptr, &mtls); 516 517 return; 518 } 519 520 for (CPUClosure* cpuClosure : mClosures) { 521 const Closure* closure = cpuClosure->mClosure; 522 const ScriptKernelID* kernelID = 523 (const ScriptKernelID*)closure->mFunctionID.get(); 524 cpuClosure->mSi->preLaunch(kernelID->mSlot, 525 (const Allocation**)closure->mArgs, 526 closure->mNumArg, closure->mReturnValue, 527 nullptr, 0, nullptr); 528 } 529 530 const CPUClosure* cpuClosure = mClosures.front(); 531 const Closure* closure = cpuClosure->mClosure; 532 MTLaunchStruct mtls; 533 534 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 535 closure->mNumArg, 536 closure->mReturnValue, 537 nullptr, 0, nullptr, &mtls)) { 538 539 mtls.script = nullptr; 540 mtls.kernel = (void (*)())&groupRoot; 541 mtls.fep.usr = &mClosures; 
542 543 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 544 } 545 546 for (CPUClosure* cpuClosure : mClosures) { 547 const Closure* closure = cpuClosure->mClosure; 548 const ScriptKernelID* kernelID = 549 (const ScriptKernelID*)closure->mFunctionID.get(); 550 cpuClosure->mSi->postLaunch(kernelID->mSlot, 551 (const Allocation**)closure->mArgs, 552 closure->mNumArg, closure->mReturnValue, 553 nullptr, 0, nullptr); 554 } 555} 556 557} // namespace renderscript 558} // namespace android 559