// rsCpuIntrinsic3DLUT.cpp revision 5dcaaa5f50926bebf6877e254c521faa7e2593e3
144d362409d5469aed47d19e7908d19bd194493aThomas Graf/* 244d362409d5469aed47d19e7908d19bd194493aThomas Graf * Copyright (C) 2012 The Android Open Source Project 344d362409d5469aed47d19e7908d19bd194493aThomas Graf * 444d362409d5469aed47d19e7908d19bd194493aThomas Graf * Licensed under the Apache License, Version 2.0 (the "License"); 544d362409d5469aed47d19e7908d19bd194493aThomas Graf * you may not use this file except in compliance with the License. 644d362409d5469aed47d19e7908d19bd194493aThomas Graf * You may obtain a copy of the License at 744d362409d5469aed47d19e7908d19bd194493aThomas Graf * 844d362409d5469aed47d19e7908d19bd194493aThomas Graf * http://www.apache.org/licenses/LICENSE-2.0 91155370f520cb64657e25153255cf7dc1424317fThomas Graf * 1044d362409d5469aed47d19e7908d19bd194493aThomas Graf * Unless required by applicable law or agreed to in writing, software 1144d362409d5469aed47d19e7908d19bd194493aThomas Graf * distributed under the License is distributed on an "AS IS" BASIS, 1244d362409d5469aed47d19e7908d19bd194493aThomas Graf * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 136782b6f709d03877a5661a4c8d8f8bd1b461f43fThomas Graf * See the License for the specific language governing permissions and 1444d362409d5469aed47d19e7908d19bd194493aThomas Graf * limitations under the License. 
1544d362409d5469aed47d19e7908d19bd194493aThomas Graf */ 1644d362409d5469aed47d19e7908d19bd194493aThomas Graf 1744d362409d5469aed47d19e7908d19bd194493aThomas Graf 1844d362409d5469aed47d19e7908d19bd194493aThomas Graf#include "rsCpuIntrinsic.h" 1944d362409d5469aed47d19e7908d19bd194493aThomas Graf#include "rsCpuIntrinsicInlines.h" 2044d362409d5469aed47d19e7908d19bd194493aThomas Graf 2144d362409d5469aed47d19e7908d19bd194493aThomas Grafusing namespace android; 2244d362409d5469aed47d19e7908d19bd194493aThomas Grafusing namespace android::renderscript; 2344d362409d5469aed47d19e7908d19bd194493aThomas Graf 2444d362409d5469aed47d19e7908d19bd194493aThomas Grafnamespace android { 2544d362409d5469aed47d19e7908d19bd194493aThomas Grafnamespace renderscript { 2644d362409d5469aed47d19e7908d19bd194493aThomas Graf 2744d362409d5469aed47d19e7908d19bd194493aThomas Graf 2844d362409d5469aed47d19e7908d19bd194493aThomas Grafclass RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic { 291155370f520cb64657e25153255cf7dc1424317fThomas Grafpublic: 3044d362409d5469aed47d19e7908d19bd194493aThomas Graf virtual void populateScript(Script *); 3144d362409d5469aed47d19e7908d19bd194493aThomas Graf virtual void invokeFreeChildren(); 3244d362409d5469aed47d19e7908d19bd194493aThomas Graf 3344d362409d5469aed47d19e7908d19bd194493aThomas Graf virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 3444d362409d5469aed47d19e7908d19bd194493aThomas Graf 3544d362409d5469aed47d19e7908d19bd194493aThomas Graf virtual ~RsdCpuScriptIntrinsic3DLUT(); 3644d362409d5469aed47d19e7908d19bd194493aThomas Graf RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 3744d362409d5469aed47d19e7908d19bd194493aThomas Graf 3844d362409d5469aed47d19e7908d19bd194493aThomas Grafprotected: 391155370f520cb64657e25153255cf7dc1424317fThomas Graf ObjectBaseRef<Allocation> mLUT; 4044d362409d5469aed47d19e7908d19bd194493aThomas Graf 4144d362409d5469aed47d19e7908d19bd194493aThomas Graf static void 
kernel(const RsForEachStubParamStruct *p, 4244d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t xstart, uint32_t xend, 4344d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t instep, uint32_t outstep); 4444d362409d5469aed47d19e7908d19bd194493aThomas Graf}; 451155370f520cb64657e25153255cf7dc1424317fThomas Graf 4644d362409d5469aed47d19e7908d19bd194493aThomas Graf} 4744d362409d5469aed47d19e7908d19bd194493aThomas Graf} 4844d362409d5469aed47d19e7908d19bd194493aThomas Graf 4944d362409d5469aed47d19e7908d19bd194493aThomas Graf 5044d362409d5469aed47d19e7908d19bd194493aThomas Grafvoid RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) { 5144d362409d5469aed47d19e7908d19bd194493aThomas Graf rsAssert(slot == 0); 5244d362409d5469aed47d19e7908d19bd194493aThomas Graf mLUT.set(static_cast<Allocation *>(data)); 5344d362409d5469aed47d19e7908d19bd194493aThomas Graf} 5444d362409d5469aed47d19e7908d19bd194493aThomas Graf 5544d362409d5469aed47d19e7908d19bd194493aThomas Grafextern "C" size_t rsdIntrinsic3DLUT_K(void *dst, void const *in, size_t count, 5644d362409d5469aed47d19e7908d19bd194493aThomas Graf void const *lut, 5744d362409d5469aed47d19e7908d19bd194493aThomas Graf int32_t pitchy, int32_t pitchz, 5844d362409d5469aed47d19e7908d19bd194493aThomas Graf int dimx, int dimy, int dimz); 5944d362409d5469aed47d19e7908d19bd194493aThomas Graf 6044d362409d5469aed47d19e7908d19bd194493aThomas Graf 6144d362409d5469aed47d19e7908d19bd194493aThomas Grafvoid RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p, 6244d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t xstart, uint32_t xend, 6344d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t instep, uint32_t outstep) { 6444d362409d5469aed47d19e7908d19bd194493aThomas Graf RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr; 6544d362409d5469aed47d19e7908d19bd194493aThomas Graf 6644d362409d5469aed47d19e7908d19bd194493aThomas Graf uchar4 *out = (uchar4 *)p->out; 
6744d362409d5469aed47d19e7908d19bd194493aThomas Graf uchar4 *in = (uchar4 *)p->in; 6844d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t x1 = xstart; 6944d362409d5469aed47d19e7908d19bd194493aThomas Graf uint32_t x2 = xend; 7044d362409d5469aed47d19e7908d19bd194493aThomas Graf 7144d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr; 7244d362409d5469aed47d19e7908d19bd194493aThomas Graf 7344d362409d5469aed47d19e7908d19bd194493aThomas Graf int4 dims = { 7444d362409d5469aed47d19e7908d19bd194493aThomas Graf static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimX - 1), 7544d362409d5469aed47d19e7908d19bd194493aThomas Graf static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimY - 1), 7644d362409d5469aed47d19e7908d19bd194493aThomas Graf static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimZ - 1), 7744d362409d5469aed47d19e7908d19bd194493aThomas Graf -1 7844d362409d5469aed47d19e7908d19bd194493aThomas Graf }; 7944d362409d5469aed47d19e7908d19bd194493aThomas Graf const float4 m = (float4)(1.f / 255.f) * convert_float4(dims); 8044d362409d5469aed47d19e7908d19bd194493aThomas Graf const int4 coordMul = convert_int4(m * (float4)0x8000); 8144d362409d5469aed47d19e7908d19bd194493aThomas Graf const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride; 8244d362409d5469aed47d19e7908d19bd194493aThomas Graf const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY; 8344d362409d5469aed47d19e7908d19bd194493aThomas Graf 8444d362409d5469aed47d19e7908d19bd194493aThomas Graf //ALOGE("strides %zu %zu", stride_y, stride_z); 8544d362409d5469aed47d19e7908d19bd194493aThomas Graf 8644d362409d5469aed47d19e7908d19bd194493aThomas Graf while (x1 < x2) { 8744d362409d5469aed47d19e7908d19bd194493aThomas Graf#if defined(ARCH_ARM_HAVE_VFP) 8844d362409d5469aed47d19e7908d19bd194493aThomas Graf if (gArchUseSIMD) { 8944d362409d5469aed47d19e7908d19bd194493aThomas Graf int32_t len = x2 - x1; 
9044d362409d5469aed47d19e7908d19bd194493aThomas Graf if(len >= 8) { 9144d362409d5469aed47d19e7908d19bd194493aThomas Graf size_t done; 9244d362409d5469aed47d19e7908d19bd194493aThomas Graf done = len - rsdIntrinsic3DLUT_K(out, in, len, 9344d362409d5469aed47d19e7908d19bd194493aThomas Graf bp, stride_y, stride_z, 9444d362409d5469aed47d19e7908d19bd194493aThomas Graf dims.x, dims.y, dims.z); 9544d362409d5469aed47d19e7908d19bd194493aThomas Graf 9644d362409d5469aed47d19e7908d19bd194493aThomas Graf x1 += done; 9744d362409d5469aed47d19e7908d19bd194493aThomas Graf out += done; 9844d362409d5469aed47d19e7908d19bd194493aThomas Graf in += done; 9944d362409d5469aed47d19e7908d19bd194493aThomas Graf } 10044d362409d5469aed47d19e7908d19bd194493aThomas Graf } 10144d362409d5469aed47d19e7908d19bd194493aThomas Graf#endif 10244d362409d5469aed47d19e7908d19bd194493aThomas Graf 10344d362409d5469aed47d19e7908d19bd194493aThomas Graf int4 baseCoord = convert_int4(*in) * coordMul; 10444d362409d5469aed47d19e7908d19bd194493aThomas Graf int4 coord1 = baseCoord >> (int4)15; 10544d362409d5469aed47d19e7908d19bd194493aThomas Graf //int4 coord2 = min(coord1 + 1, gDims - 1); 10644d362409d5469aed47d19e7908d19bd194493aThomas Graf 10744d362409d5469aed47d19e7908d19bd194493aThomas Graf int4 weight2 = baseCoord & 0x7fff; 10844d362409d5469aed47d19e7908d19bd194493aThomas Graf int4 weight1 = (int4)0x8000 - weight2; 10944d362409d5469aed47d19e7908d19bd194493aThomas Graf 11044d362409d5469aed47d19e7908d19bd194493aThomas Graf //ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); 11144d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z); 11244d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar4 *pt_00 = (const uchar4 *)&bp2[0]; 11344d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y]; 11444d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar4 
*pt_01 = (const uchar4 *)&bp2[stride_z]; 11544d362409d5469aed47d19e7908d19bd194493aThomas Graf const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z]; 11644d362409d5469aed47d19e7908d19bd194493aThomas Graf 11744d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v000 = convert_uint4(pt_00[0]); 11844d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v100 = convert_uint4(pt_00[1]); 11944d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v010 = convert_uint4(pt_10[0]); 12044d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v110 = convert_uint4(pt_10[1]); 12144d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v001 = convert_uint4(pt_01[0]); 12244d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v101 = convert_uint4(pt_01[1]); 12344d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v011 = convert_uint4(pt_11[0]); 12444d362409d5469aed47d19e7908d19bd194493aThomas Graf uint4 v111 = convert_uint4(pt_11[1]); 125 126 uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7; 127 uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7; 128 uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7; 129 uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7; 130 131 uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15; 132 uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15; 133 134 uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15; 135 uint4 v2 = (v + 0x7f) >> (int4)8; 136 137 uchar4 ret = convert_uchar4(v2); 138 ret.w = in->w; 139 140 #if 0 141 if (!x1) { 142 ALOGE("in %08x %08x %08x %08x", in->r, in->g, in->b, in->a); 143 ALOGE("baseCoord %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w); 144 ALOGE("coord1 %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w); 145 ALOGE("weight1 %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w); 146 ALOGE("weight2 %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w); 147 148 
ALOGE("v000 %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w); 149 ALOGE("v100 %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w); 150 ALOGE("yz00 %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w); 151 ALOGE("z0 %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w); 152 153 ALOGE("v %08x %08x %08x %08x", v.x, v.y, v.z, v.w); 154 ALOGE("v2 %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w); 155 } 156 #endif 157 *out = ret; 158 159 160 in++; 161 out++; 162 x1++; 163 } 164} 165 166RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, 167 const Script *s, const Element *e) 168 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) { 169 170 mRootPtr = &kernel; 171} 172 173RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() { 174} 175 176void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) { 177 s->mHal.info.exportedVariableCount = 1; 178} 179 180void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() { 181 mLUT.clear(); 182} 183 184 185RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx, 186 const Script *s, const Element *e) { 187 188 return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e); 189} 190 191 192