rsCpuScriptGroup2.cpp revision 062c287f573ecc06c38ee4295e5627e12c52ac3d
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4#include <stdio.h> 5#include <stdlib.h> 6#include <unistd.h> 7 8#include <set> 9#include <sstream> 10#include <string> 11#include <vector> 12 13#ifndef RS_COMPATIBILITY_LIB 14#include "bcc/Config/Config.h" 15#include <sys/wait.h> 16#endif 17 18#include "cpu_ref/rsCpuCore.h" 19#include "rsClosure.h" 20#include "rsContext.h" 21#include "rsCpuCore.h" 22#include "rsCpuExecutable.h" 23#include "rsCpuScript.h" 24#include "rsScript.h" 25#include "rsScriptGroup2.h" 26#include "rsScriptIntrinsic.h" 27 28using std::string; 29using std::vector; 30 31namespace android { 32namespace renderscript { 33 34namespace { 35 36const size_t DefaultKernelArgCount = 2; 37 38void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, 39 uint32_t xend, uint32_t outstep) { 40 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr; 41 RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams; 42 const void **oldIns = kparams->ins; 43 uint32_t *oldStrides = kparams->inEStrides; 44 45 std::vector<const void*> ins(DefaultKernelArgCount); 46 std::vector<uint32_t> strides(DefaultKernelArgCount); 47 48 for (CPUClosure* cpuClosure : closures) { 49 const Closure* closure = cpuClosure->mClosure; 50 51 auto in_iter = ins.begin(); 52 auto stride_iter = strides.begin(); 53 54 for (size_t i = 0; i < closure->mNumArg; i++) { 55 const void* arg = closure->mArgs[i]; 56 const Allocation* a = (const Allocation*)arg; 57 const uint32_t eStride = a->mHal.state.elementSizeBytes; 58 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 59 eStride * xstart; 60 if (kparams->dimY > 1) { 61 ptr += a->mHal.drvState.lod[0].stride * kparams->y; 62 } 63 *in_iter++ = ptr; 64 *stride_iter++ = eStride; 65 } 66 67 mutable_kparams->ins = &ins[0]; 68 mutable_kparams->inEStrides = &strides[0]; 69 70 const Allocation* out = closure->mReturnValue; 71 const uint32_t ostep = out->mHal.state.elementSizeBytes; 72 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 73 ostep * xstart; 74 if (kparams->dimY > 1) { 75 ptr += out->mHal.drvState.lod[0].stride * kparams->y; 76 } 77 78 mutable_kparams->out = (void*)ptr; 79 80 cpuClosure->mFunc(kparams, xstart, xend, ostep); 81 } 82 83 mutable_kparams->ins = oldIns; 84 mutable_kparams->inEStrides = oldStrides; 85} 86 87} // namespace 88 89Batch::Batch(CpuScriptGroup2Impl* group, const char* name) : 90 mGroup(group), mFunc(nullptr) { 91 mName = strndup(name, strlen(name)); 92} 93 94Batch::~Batch() { 95 for (CPUClosure* c : mClosures) { 96 delete c; 97 } 98 free(mName); 99} 100 101bool Batch::conflict(CPUClosure* cpuClosure) const { 102 if (mClosures.empty()) { 103 return false; 104 } 105 106 const Closure* closure = cpuClosure->mClosure; 107 108 if (!closure->mIsKernel || !mClosures.front()->mClosure->mIsKernel) { 109 // An invoke should be in a batch by itself, so it conflicts with any other 110 // closure. 111 return true; 112 } 113 114 const auto& globalDeps = closure->mGlobalDeps; 115 const auto& argDeps = closure->mArgDeps; 116 117 for (CPUClosure* c : mClosures) { 118 const Closure* batched = c->mClosure; 119 if (globalDeps.find(batched) != globalDeps.end()) { 120 return true; 121 } 122 const auto& it = argDeps.find(batched); 123 if (it != argDeps.end()) { 124 const auto& args = (*it).second; 125 for (const auto &p1 : *args) { 126 if (p1.second->get() != nullptr) { 127 return true; 128 } 129 } 130 } 131 } 132 133 return false; 134} 135 136CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 137 const ScriptGroupBase *sg) : 138 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)), 139 mExecutable(nullptr), mScriptObj(nullptr) { 140 rsAssert(!mGroup->mClosures.empty()); 141 142 Batch* batch = new Batch(this, "Batch0"); 143 int i = 0; 144 for (Closure* closure: mGroup->mClosures) { 145 CPUClosure* cc; 146 const IDBase* funcID = closure->mFunctionID.get(); 147 RsdCpuScriptImpl* si = 148 (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript); 149 if (closure->mIsKernel) { 150 MTLaunchStruct mtls; 151 si->forEachKernelSetup(funcID->mSlot, &mtls); 152 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel); 153 } else { 154 cc = new CPUClosure(closure, si); 155 } 156 157 if (batch->conflict(cc)) { 158 mBatches.push_back(batch); 159 std::stringstream ss; 160 ss << "Batch" << ++i; 161 batch = new Batch(this, ss.str().c_str()); 162 } 163 164 batch->mClosures.push_back(cc); 165 } 166 167 rsAssert(!batch->mClosures.empty()); 168 mBatches.push_back(batch); 169 170#ifndef RS_COMPATIBILITY_LIB 171 compile(mGroup->mCacheDir); 172 if (mScriptObj != nullptr && mExecutable != nullptr) { 173 for (Batch* batch : mBatches) { 174 batch->resolveFuncPtr(mScriptObj); 175 } 176 } 177#endif // RS_COMPATIBILITY_LIB 178} 179 180void Batch::resolveFuncPtr(void* sharedObj) { 181 std::string funcName(mName); 182 if (mClosures.front()->mClosure->mIsKernel) { 183 funcName.append(".expand"); 184 } 185 mFunc = dlsym(sharedObj, funcName.c_str()); 186 rsAssert (mFunc != nullptr); 187} 188 189CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 190 for (Batch* batch : mBatches) { 191 delete batch; 192 } 193 // TODO: move this dlclose into ~ScriptExecutable(). 194 if (mScriptObj != nullptr) { 195 dlclose(mScriptObj); 196 } 197 delete mExecutable; 198} 199 200namespace { 201 202#ifndef RS_COMPATIBILITY_LIB 203 204string getFileName(string path) { 205 unsigned found = path.find_last_of("/\\"); 206 return path.substr(found + 1); 207} 208 209void setupCompileArguments( 210 const vector<string>& inputs, const vector<string>& kernelBatches, 211 const vector<string>& invokeBatches, 212 const string& output_dir, const string& output_filename, 213 const string& rsLib, vector<const char*>* args) { 214 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 215 args->push_back("-fPIC"); 216 args->push_back("-embedRSInfo"); 217 args->push_back("-mtriple"); 218 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 219 args->push_back("-bclib"); 220 args->push_back(rsLib.c_str()); 221 for (const string& input : inputs) { 222 args->push_back(input.c_str()); 223 } 224 for (const string& batch : kernelBatches) { 225 args->push_back("-merge"); 226 args->push_back(batch.c_str()); 227 } 228 for (const string& batch : invokeBatches) { 229 args->push_back("-invoke"); 230 args->push_back(batch.c_str()); 231 } 232 args->push_back("-output_path"); 233 args->push_back(output_dir.c_str()); 234 args->push_back("-o"); 235 args->push_back(output_filename.c_str()); 236 args->push_back(nullptr); 237} 238 239bool fuseAndCompile(const char** arguments, 240 const string& commandLine) { 241 const pid_t pid = fork(); 242 243 if (pid == -1) { 244 ALOGE("Couldn't fork for bcc execution"); 245 return false; 246 } 247 248 if (pid == 0) { 249 // Child process 250 ALOGV("Invoking BCC with: %s", commandLine.c_str()); 251 execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); 252 253 ALOGE("execv() failed: %s", strerror(errno)); 254 abort(); 255 return false; 256 } 257 258 // Parent process 259 int status = 0; 260 const pid_t w = waitpid(pid, &status, 0); 261 if (w == -1) { 262 return false; 263 } 264 265 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { 266 ALOGE("bcc terminated unexpectedly"); 267 return false; 268 } 269 270 return true; 271} 272 273void generateSourceSlot(const Closure& closure, 274 const std::vector<std::string>& inputs, 275 std::stringstream& ss) { 276 const IDBase* funcID = (const IDBase*)closure.mFunctionID.get(); 277 const Script* script = funcID->mScript; 278 279 rsAssert (!script->isIntrinsic()); 280 281 const RsdCpuScriptImpl *cpuScript = 282 (const RsdCpuScriptImpl*)script->mHal.drv; 283 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 284 285 const int index = find(inputs.begin(), inputs.end(), bitcodeFilename) - 286 inputs.begin(); 287 288 ss << index << "," << funcID->mSlot << "."; 289} 290 291#endif // RS_COMPATIBILTY_LIB 292 293} // anonymous namespace 294 295void CpuScriptGroup2Impl::compile(const char* cacheDir) { 296#ifndef RS_COMPATIBILITY_LIB 297 if (mGroup->mClosures.size() < 2) { 298 return; 299 } 300 301 //===--------------------------------------------------------------------===// 302 // Fuse the input kernels and generate native code in an object file 303 //===--------------------------------------------------------------------===// 304 305 std::set<string> inputSet; 306 for (Closure* closure : mGroup->mClosures) { 307 const Script* script = closure->mFunctionID.get()->mScript; 308 309 // If any script is an intrinsic, give up trying fusing the kernels. 310 if (script->isIntrinsic()) { 311 return; 312 } 313 314 const RsdCpuScriptImpl *cpuScript = 315 (const RsdCpuScriptImpl*)script->mHal.drv; 316 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 317 inputSet.insert(bitcodeFilename); 318 } 319 320 std::vector<string> inputs(inputSet.begin(), inputSet.end()); 321 322 std::vector<string> kernelBatches; 323 std::vector<string> invokeBatches; 324 325 int i = 0; 326 for (const auto& batch : mBatches) { 327 rsAssert(batch->size() > 0); 328 329 std::stringstream ss; 330 ss << batch->mName << ":"; 331 332 if (!batch->mClosures.front()->mClosure->mIsKernel) { 333 rsAssert(batch->size() == 1); 334 generateSourceSlot(*batch->mClosures.front()->mClosure, inputs, ss); 335 invokeBatches.push_back(ss.str()); 336 } else { 337 for (const auto& cpuClosure : batch->mClosures) { 338 generateSourceSlot(*cpuClosure->mClosure, inputs, ss); 339 } 340 kernelBatches.push_back(ss.str()); 341 } 342 } 343 344 rsAssert(cacheDir != nullptr); 345 string objFilePath(cacheDir); 346 objFilePath.append("/fusedXXXXXX.o"); 347 // Find unique object file name, to make following file names unique. 348 int tempfd = mkstemps(&objFilePath[0], 2); 349 if (tempfd == -1) { 350 return; 351 } 352 TEMP_FAILURE_RETRY(close(tempfd)); 353 354 string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2)); 355 string rsLibPath(SYSLIBPATH"/libclcore.bc"); 356 vector<const char*> arguments; 357 setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir, 358 outputFileName, rsLibPath, &arguments); 359 std::unique_ptr<const char> joined( 360 rsuJoinStrings(arguments.size() - 1, arguments.data())); 361 string commandLine (joined.get()); 362 363 if (!fuseAndCompile(arguments.data(), commandLine)) { 364 unlink(objFilePath.c_str()); 365 return; 366 } 367 368 //===--------------------------------------------------------------------===// 369 // Create and load the shared lib 370 //===--------------------------------------------------------------------===// 371 372 const char* resName = outputFileName.c_str(); 373 374 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 375 ALOGE("Failed to link object file '%s'", resName); 376 return; 377 } 378 379 mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 380 if (mScriptObj == nullptr) { 381 ALOGE("Unable to load '%s'", resName); 382 return; 383 } 384 385 mExecutable = ScriptExecutable::createFromSharedObject( 386 nullptr, // RS context. Unused. 387 mScriptObj); 388 389#endif // RS_COMPATIBILITY_LIB 390} 391 392void CpuScriptGroup2Impl::execute() { 393 for (auto batch : mBatches) { 394 batch->setGlobalsForBatch(); 395 batch->run(); 396 } 397} 398 399void Batch::setGlobalsForBatch() { 400 for (CPUClosure* cpuClosure : mClosures) { 401 const Closure* closure = cpuClosure->mClosure; 402 const IDBase* funcID = closure->mFunctionID.get(); 403 Script* s = funcID->mScript;; 404 for (const auto& p : closure->mGlobals) { 405 const void* value = p.second.first; 406 int size = p.second.second; 407 if (value == nullptr && size == 0) { 408 // This indicates the current closure depends on another closure for a 409 // global in their shared module (script). In this case we don't need to 410 // copy the value. For example, an invoke intializes a global variable 411 // which a kernel later reads. 412 continue; 413 } 414 rsAssert(p.first != nullptr); 415 ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)", 416 closure, p.first, p.first->mScript, p.first->mSlot); 417 Script* script = p.first->mScript; 418 const RsdCpuScriptImpl *cpuScript = 419 (const RsdCpuScriptImpl*)script->mHal.drv; 420 int slot = p.first->mSlot; 421 ScriptExecutable* exec = mGroup->getExecutable(); 422 if (exec != nullptr) { 423 const char* varName = cpuScript->getFieldName(slot); 424 void* addr = exec->getFieldAddress(varName); 425 if (size < 0) { 426 rsrSetObject(mGroup->getCpuRefImpl()->getContext(), 427 (rs_object_base*)addr, (ObjectBase*)value); 428 } else { 429 memcpy(addr, (const void*)&value, size); 430 } 431 } else { 432 // We use -1 size to indicate an ObjectBase rather than a primitive type 433 if (size < 0) { 434 s->setVarObj(slot, (ObjectBase*)value); 435 } else { 436 s->setVar(slot, (const void*)&value, size); 437 } 438 } 439 } 440 } 441} 442 443void Batch::run() { 444 if (!mClosures.front()->mClosure->mIsKernel) { 445 rsAssert(mClosures.size() == 1); 446 447 // This batch contains a single closure for an invoke function 448 CPUClosure* cc = mClosures.front(); 449 const Closure* c = cc->mClosure; 450 451 if (mFunc != nullptr) { 452 // TODO: Need align pointers for x86_64. 453 // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp 454 ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength); 455 } else { 456 const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get(); 457 rsAssert(invokeID != nullptr); 458 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 459 } 460 461 return; 462 } 463 464 if (mFunc != nullptr) { 465 MTLaunchStruct mtls; 466 const CPUClosure* firstCpuClosure = mClosures.front(); 467 const CPUClosure* lastCpuClosure = mClosures.back(); 468 469 firstCpuClosure->mSi->forEachMtlsSetup( 470 (const Allocation**)firstCpuClosure->mClosure->mArgs, 471 firstCpuClosure->mClosure->mNumArg, 472 lastCpuClosure->mClosure->mReturnValue, 473 nullptr, 0, nullptr, &mtls); 474 475 mtls.script = nullptr; 476 mtls.fep.usr = nullptr; 477 mtls.kernel = (ForEachFunc_t)mFunc; 478 479 mGroup->getCpuRefImpl()->launchThreads( 480 (const Allocation**)firstCpuClosure->mClosure->mArgs, 481 firstCpuClosure->mClosure->mNumArg, 482 lastCpuClosure->mClosure->mReturnValue, 483 nullptr, &mtls); 484 485 return; 486 } 487 488 for (CPUClosure* cpuClosure : mClosures) { 489 const Closure* closure = cpuClosure->mClosure; 490 const ScriptKernelID* kernelID = 491 (const ScriptKernelID*)closure->mFunctionID.get(); 492 cpuClosure->mSi->preLaunch(kernelID->mSlot, 493 (const Allocation**)closure->mArgs, 494 closure->mNumArg, closure->mReturnValue, 495 nullptr, 0, nullptr); 496 } 497 498 const CPUClosure* cpuClosure = mClosures.front(); 499 const Closure* closure = cpuClosure->mClosure; 500 MTLaunchStruct mtls; 501 502 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 503 closure->mNumArg, 504 closure->mReturnValue, 505 nullptr, 0, nullptr, &mtls)) { 506 507 mtls.script = nullptr; 508 mtls.kernel = (void (*)())&groupRoot; 509 mtls.fep.usr = &mClosures; 510 511 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 512 } 513 514 for (CPUClosure* cpuClosure : mClosures) { 515 const Closure* closure = cpuClosure->mClosure; 516 const ScriptKernelID* kernelID = 517 (const ScriptKernelID*)closure->mFunctionID.get(); 518 cpuClosure->mSi->postLaunch(kernelID->mSlot, 519 (const Allocation**)closure->mArgs, 520 closure->mNumArg, closure->mReturnValue, 521 nullptr, 0, nullptr); 522 } 523} 524 525} // namespace renderscript 526} // namespace android 527