rsCpuScriptGroup2.cpp revision 3e5318a36be470ba7a8c5cf82bbe069178733b11
1#include "rsCpuScriptGroup2.h" 2 3#include <dlfcn.h> 4#include <stdio.h> 5#include <stdlib.h> 6#include <unistd.h> 7 8#include <string> 9#include <vector> 10 11#ifndef RS_COMPATIBILITY_LIB 12#include "bcc/Config/Config.h" 13#include <sys/wait.h> 14#endif 15 16#include "cpu_ref/rsCpuCore.h" 17#include "rsClosure.h" 18#include "rsContext.h" 19#include "rsCpuCore.h" 20#include "rsCpuExecutable.h" 21#include "rsCpuScript.h" 22#include "rsScript.h" 23#include "rsScriptGroup2.h" 24#include "rsScriptIntrinsic.h" 25 26using std::string; 27using std::vector; 28 29namespace android { 30namespace renderscript { 31 32namespace { 33 34const size_t DefaultKernelArgCount = 2; 35 36void groupRoot(const RsExpandKernelParams *kparams, uint32_t xstart, 37 uint32_t xend, uint32_t outstep) { 38 const List<CPUClosure*>& closures = *(List<CPUClosure*>*)kparams->usr; 39 RsExpandKernelParams *mutable_kparams = (RsExpandKernelParams *)kparams; 40 const void **oldIns = kparams->ins; 41 uint32_t *oldStrides = kparams->inEStrides; 42 43 std::vector<const void*> ins(DefaultKernelArgCount); 44 std::vector<uint32_t> strides(DefaultKernelArgCount); 45 46 for (CPUClosure* cpuClosure : closures) { 47 const Closure* closure = cpuClosure->mClosure; 48 49 auto in_iter = ins.begin(); 50 auto stride_iter = strides.begin(); 51 52 for (size_t i = 0; i < closure->mNumArg; i++) { 53 const void* arg = closure->mArgs[i]; 54 const Allocation* a = (const Allocation*)arg; 55 const uint32_t eStride = a->mHal.state.elementSizeBytes; 56 const uint8_t* ptr = (uint8_t*)(a->mHal.drvState.lod[0].mallocPtr) + 57 eStride * xstart; 58 if (kparams->dimY > 1) { 59 ptr += a->mHal.drvState.lod[0].stride * kparams->y; 60 } 61 *in_iter++ = ptr; 62 *stride_iter++ = eStride; 63 } 64 65 mutable_kparams->ins = &ins[0]; 66 mutable_kparams->inEStrides = &strides[0]; 67 68 const Allocation* out = closure->mReturnValue; 69 const uint32_t ostep = out->mHal.state.elementSizeBytes; 70 const uint8_t* ptr = (uint8_t *)(out->mHal.drvState.lod[0].mallocPtr) + 71 ostep * xstart; 72 if (kparams->dimY > 1) { 73 ptr += out->mHal.drvState.lod[0].stride * kparams->y; 74 } 75 76 mutable_kparams->out = (void*)ptr; 77 78 mutable_kparams->usr = cpuClosure->mUsrPtr; 79 80 cpuClosure->mFunc(kparams, xstart, xend, ostep); 81 } 82 83 mutable_kparams->ins = oldIns; 84 mutable_kparams->inEStrides = oldStrides; 85 mutable_kparams->usr = &closures; 86} 87 88} // namespace 89 90Batch::~Batch() { 91 for (CPUClosure* c : mClosures) { 92 delete c; 93 } 94 if (mScriptObj) { 95 dlclose(mScriptObj); 96 } 97} 98 99bool Batch::conflict(CPUClosure* cpuClosure) const { 100 if (mClosures.empty()) { 101 return false; 102 } 103 104 const Closure* closure = cpuClosure->mClosure; 105 106 if (closure->mKernelID.get() == nullptr || 107 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 108 // An invoke should be in a batch by itself, so it conflicts with any other 109 // closure. 110 return true; 111 } 112 113 const auto& globalDeps = closure->mGlobalDeps; 114 const auto& argDeps = closure->mArgDeps; 115 116 for (CPUClosure* c : mClosures) { 117 const Closure* batched = c->mClosure; 118 if (globalDeps.find(batched) != globalDeps.end()) { 119 return true; 120 } 121 const auto& it = argDeps.find(batched); 122 if (it != argDeps.end()) { 123 const auto& args = (*it).second; 124 for (const auto &p1 : *args) { 125 if (p1.second->get() != nullptr) { 126 return true; 127 } 128 } 129 } 130 } 131 132 return false; 133} 134 135CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl, 136 const ScriptGroupBase *sg) : 137 mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)) { 138 rsAssert(!mGroup->mClosures.empty()); 139 140 Batch* batch = new Batch(this); 141 for (Closure* closure: mGroup->mClosures) { 142 const ScriptKernelID* kernelID = closure->mKernelID.get(); 143 RsdCpuScriptImpl* si; 144 CPUClosure* cc; 145 if (kernelID != nullptr) { 146 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(kernelID->mScript); 147 MTLaunchStruct mtls; 148 si->forEachKernelSetup(kernelID->mSlot, &mtls); 149 // TODO: Is mtls.fep.usrLen ever used? 150 cc = new CPUClosure(closure, si, (ExpandFuncTy)mtls.kernel, 151 mtls.fep.usr, mtls.fep.usrLen); 152 } else { 153 si = (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript( 154 closure->mInvokeID->mScript); 155 cc = new CPUClosure(closure, si); 156 } 157 158 if (batch->conflict(cc)) { 159 mBatches.push_back(batch); 160 batch = new Batch(this); 161 } 162 163 batch->mClosures.push_back(cc); 164 } 165 166 rsAssert(!batch->mClosures.empty()); 167 mBatches.push_back(batch); 168 169#ifndef RS_COMPATIBILITY_LIB 170 for (Batch* batch : mBatches) { 171 batch->tryToCreateFusedKernel(mGroup->mCacheDir); 172 } 173#endif 174} 175 176CpuScriptGroup2Impl::~CpuScriptGroup2Impl() { 177 for (Batch* batch : mBatches) { 178 delete batch; 179 } 180} 181 182namespace { 183 184#ifndef RS_COMPATIBILITY_LIB 185 186string getFileName(string path) { 187 unsigned found = path.find_last_of("/\\"); 188 return path.substr(found + 1); 189} 190 191void setupCompileArguments( 192 const vector<string>& inputs, const vector<int>& kernels, 193 const string& output_dir, const string& output_filename, 194 const string& rsLib, vector<const char*>* args) { 195 args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH); 196 args->push_back("-fPIC"); 197 args->push_back("-embedRSInfo"); 198 args->push_back("-mtriple"); 199 args->push_back(DEFAULT_TARGET_TRIPLE_STRING); 200 args->push_back("-bclib"); 201 args->push_back(rsLib.c_str()); 202 for (const string& input : inputs) { 203 args->push_back(input.c_str()); 204 } 205 for (int kernel : kernels) { 206 args->push_back("-k"); 207 string strKernel = std::to_string(kernel); 208 args->push_back(strKernel.c_str()); 209 } 210 args->push_back("-output_path"); 211 args->push_back(output_dir.c_str()); 212 args->push_back("-o"); 213 args->push_back(output_filename.c_str()); 214 args->push_back(nullptr); 215} 216 217bool fuseAndCompile(const char** arguments, 218 const string& commandLine) { 219 const pid_t pid = fork(); 220 221 if (pid == -1) { 222 ALOGE("Couldn't fork for bcc execution"); 223 return false; 224 } 225 226 if (pid == 0) { 227 // Child process 228 ALOGV("Invoking BCC with: %s", commandLine.c_str()); 229 execv(RsdCpuScriptImpl::BCC_EXE_PATH, (char* const*)arguments); 230 231 ALOGE("execv() failed: %s", strerror(errno)); 232 abort(); 233 return false; 234 } 235 236 // Parent process 237 int status = 0; 238 const pid_t w = waitpid(pid, &status, 0); 239 if (w == -1) { 240 return false; 241 } 242 243 if (!WIFEXITED(status) || WEXITSTATUS(status) != 0 ) { 244 ALOGE("bcc terminated unexpectedly"); 245 return false; 246 } 247 248 return true; 249} 250#endif 251 252} // anonymous namespace 253 254void Batch::tryToCreateFusedKernel(const char *cacheDir) { 255#ifndef RS_COMPATIBILITY_LIB 256 if (mClosures.size() < 2) { 257 return; 258 } 259 260 //===--------------------------------------------------------------------===// 261 // Fuse the input kernels and generate native code in an object file 262 //===--------------------------------------------------------------------===// 263 264 std::vector<string> inputFiles; 265 std::vector<int> slots; 266 267 for (CPUClosure* cpuClosure : mClosures) { 268 const Closure* closure = cpuClosure->mClosure; 269 const ScriptKernelID* kernelID = closure->mKernelID.get(); 270 const Script* script = kernelID->mScript; 271 272 if (script->isIntrinsic()) { 273 return; 274 } 275 276 const RsdCpuScriptImpl *cpuScript = 277 (const RsdCpuScriptImpl*)script->mHal.drv; 278 279 const string& bitcodeFilename = cpuScript->getBitcodeFilePath(); 280 281 inputFiles.push_back(bitcodeFilename); 282 slots.push_back(kernelID->mSlot); 283 } 284 285 rsAssert(cacheDir != nullptr); 286 string objFilePath(cacheDir); 287 objFilePath.append("/fusedXXXXXX.o"); 288 // Find unique object file name, to make following file names unique. 289 int tempfd = mkstemps(&objFilePath[0], 2); 290 if (tempfd == -1) { 291 return; 292 } 293 TEMP_FAILURE_RETRY(close(tempfd)); 294 295 string outputFileName = getFileName(objFilePath.substr(0, objFilePath.size() - 2)); 296 string rsLibPath(SYSLIBPATH"/libclcore.bc"); 297 vector<const char*> arguments; 298 setupCompileArguments(inputFiles, slots, cacheDir, outputFileName, rsLibPath, 299 &arguments); 300 std::unique_ptr<const char> joined( 301 rsuJoinStrings(arguments.size() - 1, arguments.data())); 302 string commandLine (joined.get()); 303 304 if (!fuseAndCompile(arguments.data(), commandLine)) { 305 unlink(objFilePath.c_str()); 306 return; 307 } 308 309 //===--------------------------------------------------------------------===// 310 // Create and load the shared lib 311 //===--------------------------------------------------------------------===// 312 313 const char* resName = outputFileName.c_str(); 314 315 if (!SharedLibraryUtils::createSharedLibrary(cacheDir, resName)) { 316 ALOGE("Failed to link object file '%s'", resName); 317 return; 318 } 319 320 void* mSharedObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); 321 if (mSharedObj == nullptr) { 322 ALOGE("Unable to load '%s'", resName); 323 return; 324 } 325 326 mExecutable = ScriptExecutable::createFromSharedObject( 327 nullptr, // RS context. Unused. 328 mSharedObj); 329 330#endif // RS_COMPATIBILITY_LIB 331} 332 333void CpuScriptGroup2Impl::execute() { 334 for (auto batch : mBatches) { 335 batch->setGlobalsForBatch(); 336 batch->run(); 337 } 338} 339 340void Batch::setGlobalsForBatch() { 341 for (CPUClosure* cpuClosure : mClosures) { 342 const Closure* closure = cpuClosure->mClosure; 343 const ScriptKernelID* kernelID = closure->mKernelID.get(); 344 Script* s; 345 if (kernelID != nullptr) { 346 s = kernelID->mScript; 347 } else { 348 s = cpuClosure->mClosure->mInvokeID->mScript; 349 } 350 for (const auto& p : closure->mGlobals) { 351 const void* value = p.second.first; 352 int size = p.second.second; 353 if (value == nullptr && size == 0) { 354 // This indicates the current closure depends on another closure for a 355 // global in their shared module (script). In this case we don't need to 356 // copy the value. For example, an invoke intializes a global variable 357 // which a kernel later reads. 358 continue; 359 } 360 rsAssert(p.first != nullptr); 361 ALOGV("Evaluating closure %p, setting field %p (Script %p, slot: %d)", 362 closure, p.first, p.first->mScript, p.first->mSlot); 363 // We use -1 size to indicate an ObjectBase rather than a primitive type 364 if (size < 0) { 365 s->setVarObj(p.first->mSlot, (ObjectBase*)value); 366 } else { 367 s->setVar(p.first->mSlot, (const void*)&value, size); 368 } 369 } 370 } 371} 372 373void Batch::run() { 374 if (mExecutable != nullptr) { 375 MTLaunchStruct mtls; 376 const CPUClosure* firstCpuClosure = mClosures.front(); 377 const CPUClosure* lastCpuClosure = mClosures.back(); 378 379 firstCpuClosure->mSi->forEachMtlsSetup( 380 (const Allocation**)firstCpuClosure->mClosure->mArgs, 381 firstCpuClosure->mClosure->mNumArg, 382 lastCpuClosure->mClosure->mReturnValue, 383 nullptr, 0, nullptr, &mtls); 384 385 mtls.script = nullptr; 386 mtls.fep.usr = nullptr; 387 mtls.kernel = mExecutable->getForEachFunction(0); 388 389 mGroup->getCpuRefImpl()->launchThreads( 390 (const Allocation**)firstCpuClosure->mClosure->mArgs, 391 firstCpuClosure->mClosure->mNumArg, 392 lastCpuClosure->mClosure->mReturnValue, 393 nullptr, &mtls); 394 395 return; 396 } 397 398 if (mClosures.size() == 1 && 399 mClosures.front()->mClosure->mKernelID.get() == nullptr) { 400 // This closure is for an invoke function 401 CPUClosure* cc = mClosures.front(); 402 const Closure* c = cc->mClosure; 403 const ScriptInvokeID* invokeID = c->mInvokeID; 404 rsAssert(invokeID != nullptr); 405 cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); 406 return; 407 } 408 409 for (CPUClosure* cpuClosure : mClosures) { 410 const Closure* closure = cpuClosure->mClosure; 411 const ScriptKernelID* kernelID = closure->mKernelID.get(); 412 cpuClosure->mSi->preLaunch(kernelID->mSlot, 413 (const Allocation**)closure->mArgs, 414 closure->mNumArg, closure->mReturnValue, 415 cpuClosure->mUsrPtr, cpuClosure->mUsrSize, 416 nullptr); 417 } 418 419 const CPUClosure* cpuClosure = mClosures.front(); 420 const Closure* closure = cpuClosure->mClosure; 421 MTLaunchStruct mtls; 422 423 if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, 424 closure->mNumArg, 425 closure->mReturnValue, 426 nullptr, 0, nullptr, &mtls)) { 427 428 mtls.script = nullptr; 429 mtls.kernel = (void (*)())&groupRoot; 430 mtls.fep.usr = &mClosures; 431 432 mGroup->getCpuRefImpl()->launchThreads(nullptr, 0, nullptr, nullptr, &mtls); 433 } 434 435 for (CPUClosure* cpuClosure : mClosures) { 436 const Closure* closure = cpuClosure->mClosure; 437 const ScriptKernelID* kernelID = closure->mKernelID.get(); 438 cpuClosure->mSi->postLaunch(kernelID->mSlot, 439 (const Allocation**)closure->mArgs, 440 closure->mNumArg, closure->mReturnValue, 441 nullptr, 0, nullptr); 442 } 443} 444 445} // namespace renderscript 446} // namespace android 447