rsCpuIntrinsic3DLUT.cpp revision 8994abbe699bb05fa70cff101becc925db6b2c26
17c4b888f2147edf99690b6af75470774ff31c43bJason Sams/* 27c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Copyright (C) 2012 The Android Open Source Project 37c4b888f2147edf99690b6af75470774ff31c43bJason Sams * 47c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Licensed under the Apache License, Version 2.0 (the "License"); 57c4b888f2147edf99690b6af75470774ff31c43bJason Sams * you may not use this file except in compliance with the License. 67c4b888f2147edf99690b6af75470774ff31c43bJason Sams * You may obtain a copy of the License at 77c4b888f2147edf99690b6af75470774ff31c43bJason Sams * 87c4b888f2147edf99690b6af75470774ff31c43bJason Sams * http://www.apache.org/licenses/LICENSE-2.0 97c4b888f2147edf99690b6af75470774ff31c43bJason Sams * 107c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Unless required by applicable law or agreed to in writing, software 117c4b888f2147edf99690b6af75470774ff31c43bJason Sams * distributed under the License is distributed on an "AS IS" BASIS, 127c4b888f2147edf99690b6af75470774ff31c43bJason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 137c4b888f2147edf99690b6af75470774ff31c43bJason Sams * See the License for the specific language governing permissions and 147c4b888f2147edf99690b6af75470774ff31c43bJason Sams * limitations under the License. 157c4b888f2147edf99690b6af75470774ff31c43bJason Sams */ 167c4b888f2147edf99690b6af75470774ff31c43bJason Sams 177c4b888f2147edf99690b6af75470774ff31c43bJason Sams 187c4b888f2147edf99690b6af75470774ff31c43bJason Sams#include "rsCpuIntrinsic.h" 197c4b888f2147edf99690b6af75470774ff31c43bJason Sams#include "rsCpuIntrinsicInlines.h" 207c4b888f2147edf99690b6af75470774ff31c43bJason Sams 217c4b888f2147edf99690b6af75470774ff31c43bJason Samsusing namespace android; 227c4b888f2147edf99690b6af75470774ff31c43bJason Samsusing namespace android::renderscript; 237c4b888f2147edf99690b6af75470774ff31c43bJason Sams 247c4b888f2147edf99690b6af75470774ff31c43bJason Samsnamespace android { 257c4b888f2147edf99690b6af75470774ff31c43bJason Samsnamespace renderscript { 267c4b888f2147edf99690b6af75470774ff31c43bJason Sams 277c4b888f2147edf99690b6af75470774ff31c43bJason Sams 287c4b888f2147edf99690b6af75470774ff31c43bJason Samsclass RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic { 297c4b888f2147edf99690b6af75470774ff31c43bJason Samspublic: 307c4b888f2147edf99690b6af75470774ff31c43bJason Sams virtual void populateScript(Script *); 317c4b888f2147edf99690b6af75470774ff31c43bJason Sams virtual void invokeFreeChildren(); 327c4b888f2147edf99690b6af75470774ff31c43bJason Sams 337c4b888f2147edf99690b6af75470774ff31c43bJason Sams virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 347c4b888f2147edf99690b6af75470774ff31c43bJason Sams 357c4b888f2147edf99690b6af75470774ff31c43bJason Sams virtual ~RsdCpuScriptIntrinsic3DLUT(); 367c4b888f2147edf99690b6af75470774ff31c43bJason Sams RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 377c4b888f2147edf99690b6af75470774ff31c43bJason Sams 387c4b888f2147edf99690b6af75470774ff31c43bJason Samsprotected: 397c4b888f2147edf99690b6af75470774ff31c43bJason Sams ObjectBaseRef<Allocation> mLUT; 407c4b888f2147edf99690b6af75470774ff31c43bJason Sams 417c4b888f2147edf99690b6af75470774ff31c43bJason Sams static void kernel(const RsForEachStubParamStruct *p, 427c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t xstart, uint32_t xend, 437c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t instep, uint32_t outstep); 447c4b888f2147edf99690b6af75470774ff31c43bJason Sams}; 457c4b888f2147edf99690b6af75470774ff31c43bJason Sams 467c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 477c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 487c4b888f2147edf99690b6af75470774ff31c43bJason Sams 497c4b888f2147edf99690b6af75470774ff31c43bJason Sams 507c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) { 517c4b888f2147edf99690b6af75470774ff31c43bJason Sams rsAssert(slot == 0); 527c4b888f2147edf99690b6af75470774ff31c43bJason Sams mLUT.set(static_cast<Allocation *>(data)); 537c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 547c4b888f2147edf99690b6af75470774ff31c43bJason Sams 557c4b888f2147edf99690b6af75470774ff31c43bJason Samsextern "C" void rsdIntrinsic3DLUT_K(void *dst, const void *src, const void *lut, 567c4b888f2147edf99690b6af75470774ff31c43bJason Sams size_t lut_stride_y, size_t lut_stride_z, 577c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t count, const void *constants); 587c4b888f2147edf99690b6af75470774ff31c43bJason Sams 597c4b888f2147edf99690b6af75470774ff31c43bJason Sams 607c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p, 617c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t xstart, uint32_t xend, 627c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t instep, uint32_t outstep) { 637c4b888f2147edf99690b6af75470774ff31c43bJason Sams RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr; 647c4b888f2147edf99690b6af75470774ff31c43bJason Sams 657c4b888f2147edf99690b6af75470774ff31c43bJason Sams uchar4 *out = (uchar4 *)p->out; 667c4b888f2147edf99690b6af75470774ff31c43bJason Sams uchar4 *in = (uchar4 *)p->in; 677c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t x1 = xstart; 687c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint32_t x2 = xend; 697c4b888f2147edf99690b6af75470774ff31c43bJason Sams 707c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr; 717c4b888f2147edf99690b6af75470774ff31c43bJason Sams 727c4b888f2147edf99690b6af75470774ff31c43bJason Sams int4 dims = { 738994abbe699bb05fa70cff101becc925db6b2c26synergy dev static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimX - 1), 748994abbe699bb05fa70cff101becc925db6b2c26synergy dev static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimY - 1), 758994abbe699bb05fa70cff101becc925db6b2c26synergy dev static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimZ - 1), 76d533c4c66510f2b83b9397607756479d79baae10Stephen Hines -1 777c4b888f2147edf99690b6af75470774ff31c43bJason Sams }; 78d533c4c66510f2b83b9397607756479d79baae10Stephen Hines const float4 m = (float4)(1.f / 255.f) * convert_float4(dims); 797c4b888f2147edf99690b6af75470774ff31c43bJason Sams const int4 coordMul = convert_int4(m * (float4)0x8000); 807c4b888f2147edf99690b6af75470774ff31c43bJason Sams const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride; 817c4b888f2147edf99690b6af75470774ff31c43bJason Sams const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY; 827c4b888f2147edf99690b6af75470774ff31c43bJason Sams 837c4b888f2147edf99690b6af75470774ff31c43bJason Sams //ALOGE("strides %zu %zu", stride_y, stride_z); 847c4b888f2147edf99690b6af75470774ff31c43bJason Sams 857c4b888f2147edf99690b6af75470774ff31c43bJason Sams while (x1 < x2) { 86f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams#if defined(ARCH_ARM_HAVE_VFP) 87f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams if (gArchUseSIMD) { 88f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams int32_t len = (x2 - x1 - 1) >> 1; 89f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams if(len > 0) { 90f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams const short neon_constants[] = { 918994abbe699bb05fa70cff101becc925db6b2c26synergy dev static_cast<short>(coordMul.x), static_cast<short>(coordMul.y), 928994abbe699bb05fa70cff101becc925db6b2c26synergy dev static_cast<short>(coordMul.z), 0, 0, 0, 0, static_cast<short>(0xffff), 93f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams 94f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams }; 95f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams 96f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants); 97f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams x1 += len << 1; 98f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams out += len << 1; 99f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams in += len << 1; 100f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams } 1017c4b888f2147edf99690b6af75470774ff31c43bJason Sams } 1027c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1037c4b888f2147edf99690b6af75470774ff31c43bJason Sams#endif 1047c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1057c4b888f2147edf99690b6af75470774ff31c43bJason Sams int4 baseCoord = convert_int4(*in) * coordMul; 1067c4b888f2147edf99690b6af75470774ff31c43bJason Sams int4 coord1 = baseCoord >> (int4)15; 1077c4b888f2147edf99690b6af75470774ff31c43bJason Sams //int4 coord2 = min(coord1 + 1, gDims - 1); 1087c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1097c4b888f2147edf99690b6af75470774ff31c43bJason Sams int4 weight2 = baseCoord & 0x7fff; 1107c4b888f2147edf99690b6af75470774ff31c43bJason Sams int4 weight1 = (int4)0x8000 - weight2; 1117c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1127c4b888f2147edf99690b6af75470774ff31c43bJason Sams //ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); 1137c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z); 1147c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar4 *pt_00 = (const uchar4 *)&bp2[0]; 1157c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y]; 1167c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z]; 1177c4b888f2147edf99690b6af75470774ff31c43bJason Sams const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z]; 1187c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1197c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v000 = convert_uint4(pt_00[0]); 1207c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v100 = convert_uint4(pt_00[1]); 1217c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v010 = convert_uint4(pt_10[0]); 1227c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v110 = convert_uint4(pt_10[1]); 1237c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v001 = convert_uint4(pt_01[0]); 1247c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v101 = convert_uint4(pt_01[1]); 1257c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v011 = convert_uint4(pt_11[0]); 1267c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v111 = convert_uint4(pt_11[1]); 1277c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1287c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7; 1297c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7; 1307c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7; 1317c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7; 1327c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1335e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; 1345e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; 1357c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1365e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; 1377c4b888f2147edf99690b6af75470774ff31c43bJason Sams uint4 v2 = (v + 0x7f) >> (int4)8; 1387c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1397c4b888f2147edf99690b6af75470774ff31c43bJason Sams uchar4 ret = convert_uchar4(v2); 1400b575de8ed0b628d84d256f5846500b0385979bdTim Murray ret.w = in->w; 1417c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1427c4b888f2147edf99690b6af75470774ff31c43bJason Sams #if 0 1437c4b888f2147edf99690b6af75470774ff31c43bJason Sams if (!x1) { 1447c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a); 1457c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w); 1467c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); 1477c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w); 1487c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w); 1497c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1507c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w); 1517c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w); 1527c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w); 1537c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w); 1547c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1557c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w); 1567c4b888f2147edf99690b6af75470774ff31c43bJason Sams ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w); 1577c4b888f2147edf99690b6af75470774ff31c43bJason Sams } 1587c4b888f2147edf99690b6af75470774ff31c43bJason Sams #endif 1597c4b888f2147edf99690b6af75470774ff31c43bJason Sams *out = ret; 1607c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1617c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1627c4b888f2147edf99690b6af75470774ff31c43bJason Sams in++; 1637c4b888f2147edf99690b6af75470774ff31c43bJason Sams out++; 1647c4b888f2147edf99690b6af75470774ff31c43bJason Sams x1++; 1657c4b888f2147edf99690b6af75470774ff31c43bJason Sams } 1667c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1677c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1687c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, 1697c4b888f2147edf99690b6af75470774ff31c43bJason Sams const Script *s, const Element *e) 1707c4b888f2147edf99690b6af75470774ff31c43bJason Sams : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { 1717c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1727c4b888f2147edf99690b6af75470774ff31c43bJason Sams mRootPtr = &kernel; 1737c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1747c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1757c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() { 1767c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1777c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1787c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) { 1797c4b888f2147edf99690b6af75470774ff31c43bJason Sams s->mHal.info.exportedVariableCount = 1; 1807c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1817c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1827c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() { 1837c4b888f2147edf99690b6af75470774ff31c43bJason Sams mLUT.clear(); 1847c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1857c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1867c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1877c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, 1887c4b888f2147edf99690b6af75470774ff31c43bJason Sams const Script *s, const Element *e) { 1897c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1907c4b888f2147edf99690b6af75470774ff31c43bJason Sams return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); 1917c4b888f2147edf99690b6af75470774ff31c43bJason Sams} 1927c4b888f2147edf99690b6af75470774ff31c43bJason Sams 1937c4b888f2147edf99690b6af75470774ff31c43bJason Sams 194