rsCpuIntrinsic3DLUT.cpp revision 5dcaaa5f50926bebf6877e254c521faa7e2593e3
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1644d362409d5469aed47d19e7908d19bd194493aThomas Graf
1744d362409d5469aed47d19e7908d19bd194493aThomas Graf
1844d362409d5469aed47d19e7908d19bd194493aThomas Graf#include "rsCpuIntrinsic.h"
1944d362409d5469aed47d19e7908d19bd194493aThomas Graf#include "rsCpuIntrinsicInlines.h"
2044d362409d5469aed47d19e7908d19bd194493aThomas Graf
2144d362409d5469aed47d19e7908d19bd194493aThomas Grafusing namespace android;
2244d362409d5469aed47d19e7908d19bd194493aThomas Grafusing namespace android::renderscript;
2344d362409d5469aed47d19e7908d19bd194493aThomas Graf
2444d362409d5469aed47d19e7908d19bd194493aThomas Grafnamespace android {
2544d362409d5469aed47d19e7908d19bd194493aThomas Grafnamespace renderscript {
2644d362409d5469aed47d19e7908d19bd194493aThomas Graf
2744d362409d5469aed47d19e7908d19bd194493aThomas Graf
2844d362409d5469aed47d19e7908d19bd194493aThomas Grafclass RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic {
291155370f520cb64657e25153255cf7dc1424317fThomas Grafpublic:
3044d362409d5469aed47d19e7908d19bd194493aThomas Graf    virtual void populateScript(Script *);
3144d362409d5469aed47d19e7908d19bd194493aThomas Graf    virtual void invokeFreeChildren();
3244d362409d5469aed47d19e7908d19bd194493aThomas Graf
3344d362409d5469aed47d19e7908d19bd194493aThomas Graf    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
3444d362409d5469aed47d19e7908d19bd194493aThomas Graf
3544d362409d5469aed47d19e7908d19bd194493aThomas Graf    virtual ~RsdCpuScriptIntrinsic3DLUT();
3644d362409d5469aed47d19e7908d19bd194493aThomas Graf    RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
3744d362409d5469aed47d19e7908d19bd194493aThomas Graf
3844d362409d5469aed47d19e7908d19bd194493aThomas Grafprotected:
391155370f520cb64657e25153255cf7dc1424317fThomas Graf    ObjectBaseRef<Allocation> mLUT;
4044d362409d5469aed47d19e7908d19bd194493aThomas Graf
4144d362409d5469aed47d19e7908d19bd194493aThomas Graf    static void kernel(const RsForEachStubParamStruct *p,
4244d362409d5469aed47d19e7908d19bd194493aThomas Graf                       uint32_t xstart, uint32_t xend,
4344d362409d5469aed47d19e7908d19bd194493aThomas Graf                       uint32_t instep, uint32_t outstep);
4444d362409d5469aed47d19e7908d19bd194493aThomas Graf};
451155370f520cb64657e25153255cf7dc1424317fThomas Graf
4644d362409d5469aed47d19e7908d19bd194493aThomas Graf}
4744d362409d5469aed47d19e7908d19bd194493aThomas Graf}
4844d362409d5469aed47d19e7908d19bd194493aThomas Graf
4944d362409d5469aed47d19e7908d19bd194493aThomas Graf
5044d362409d5469aed47d19e7908d19bd194493aThomas Grafvoid RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
5144d362409d5469aed47d19e7908d19bd194493aThomas Graf    rsAssert(slot == 0);
5244d362409d5469aed47d19e7908d19bd194493aThomas Graf    mLUT.set(static_cast<Allocation *>(data));
5344d362409d5469aed47d19e7908d19bd194493aThomas Graf}
5444d362409d5469aed47d19e7908d19bd194493aThomas Graf
/*
 * C-linkage routine defined outside this file and called from the SIMD
 * branch of RsdCpuScriptIntrinsic3DLUT::kernel().
 *
 * As used by that caller:
 *   dst / in        output and input pixel pointers for the current span
 *   count           number of pixels requested
 *   lut             base pointer of the LUT data
 *   pitchy / pitchz byte strides between LUT rows and LUT slices
 *   dimx/dimy/dimz  the caller passes each LUT dimension minus one
 *
 * The caller subtracts the return value from `count` to obtain the number of
 * pixels that were completed, i.e. it treats the result as the number of
 * pixels left unprocessed.
 */
extern "C" size_t rsdIntrinsic3DLUT_K(void *dst, const void *in, size_t count,
                                      const void *lut,
                                      int32_t pitchy, int32_t pitchz,
                                      int dimx, int dimy, int dimz);
5944d362409d5469aed47d19e7908d19bd194493aThomas Graf
6044d362409d5469aed47d19e7908d19bd194493aThomas Graf
6144d362409d5469aed47d19e7908d19bd194493aThomas Grafvoid RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
6244d362409d5469aed47d19e7908d19bd194493aThomas Graf                                      uint32_t xstart, uint32_t xend,
6344d362409d5469aed47d19e7908d19bd194493aThomas Graf                                      uint32_t instep, uint32_t outstep) {
6444d362409d5469aed47d19e7908d19bd194493aThomas Graf    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
6544d362409d5469aed47d19e7908d19bd194493aThomas Graf
6644d362409d5469aed47d19e7908d19bd194493aThomas Graf    uchar4 *out = (uchar4 *)p->out;
6744d362409d5469aed47d19e7908d19bd194493aThomas Graf    uchar4 *in = (uchar4 *)p->in;
6844d362409d5469aed47d19e7908d19bd194493aThomas Graf    uint32_t x1 = xstart;
6944d362409d5469aed47d19e7908d19bd194493aThomas Graf    uint32_t x2 = xend;
7044d362409d5469aed47d19e7908d19bd194493aThomas Graf
7144d362409d5469aed47d19e7908d19bd194493aThomas Graf    const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr;
7244d362409d5469aed47d19e7908d19bd194493aThomas Graf
7344d362409d5469aed47d19e7908d19bd194493aThomas Graf    int4 dims = {
7444d362409d5469aed47d19e7908d19bd194493aThomas Graf        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimX - 1),
7544d362409d5469aed47d19e7908d19bd194493aThomas Graf        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimY - 1),
7644d362409d5469aed47d19e7908d19bd194493aThomas Graf        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimZ - 1),
7744d362409d5469aed47d19e7908d19bd194493aThomas Graf        -1
7844d362409d5469aed47d19e7908d19bd194493aThomas Graf    };
7944d362409d5469aed47d19e7908d19bd194493aThomas Graf    const float4 m = (float4)(1.f / 255.f) * convert_float4(dims);
8044d362409d5469aed47d19e7908d19bd194493aThomas Graf    const int4 coordMul = convert_int4(m * (float4)0x8000);
8144d362409d5469aed47d19e7908d19bd194493aThomas Graf    const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride;
8244d362409d5469aed47d19e7908d19bd194493aThomas Graf    const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY;
8344d362409d5469aed47d19e7908d19bd194493aThomas Graf
8444d362409d5469aed47d19e7908d19bd194493aThomas Graf    //ALOGE("strides %zu %zu", stride_y, stride_z);
8544d362409d5469aed47d19e7908d19bd194493aThomas Graf
8644d362409d5469aed47d19e7908d19bd194493aThomas Graf    while (x1 < x2) {
8744d362409d5469aed47d19e7908d19bd194493aThomas Graf#if defined(ARCH_ARM_HAVE_VFP)
8844d362409d5469aed47d19e7908d19bd194493aThomas Graf        if (gArchUseSIMD) {
8944d362409d5469aed47d19e7908d19bd194493aThomas Graf            int32_t len = x2 - x1;
9044d362409d5469aed47d19e7908d19bd194493aThomas Graf            if(len >= 8) {
9144d362409d5469aed47d19e7908d19bd194493aThomas Graf                size_t done;
9244d362409d5469aed47d19e7908d19bd194493aThomas Graf               done = len - rsdIntrinsic3DLUT_K(out, in, len,
9344d362409d5469aed47d19e7908d19bd194493aThomas Graf                                      bp, stride_y, stride_z,
9444d362409d5469aed47d19e7908d19bd194493aThomas Graf                                      dims.x, dims.y, dims.z);
9544d362409d5469aed47d19e7908d19bd194493aThomas Graf
9644d362409d5469aed47d19e7908d19bd194493aThomas Graf                x1 += done;
9744d362409d5469aed47d19e7908d19bd194493aThomas Graf                out += done;
9844d362409d5469aed47d19e7908d19bd194493aThomas Graf                in += done;
9944d362409d5469aed47d19e7908d19bd194493aThomas Graf            }
10044d362409d5469aed47d19e7908d19bd194493aThomas Graf        }
10144d362409d5469aed47d19e7908d19bd194493aThomas Graf#endif
10244d362409d5469aed47d19e7908d19bd194493aThomas Graf
10344d362409d5469aed47d19e7908d19bd194493aThomas Graf        int4 baseCoord = convert_int4(*in) * coordMul;
10444d362409d5469aed47d19e7908d19bd194493aThomas Graf        int4 coord1 = baseCoord >> (int4)15;
10544d362409d5469aed47d19e7908d19bd194493aThomas Graf        //int4 coord2 = min(coord1 + 1, gDims - 1);
10644d362409d5469aed47d19e7908d19bd194493aThomas Graf
10744d362409d5469aed47d19e7908d19bd194493aThomas Graf        int4 weight2 = baseCoord & 0x7fff;
10844d362409d5469aed47d19e7908d19bd194493aThomas Graf        int4 weight1 = (int4)0x8000 - weight2;
10944d362409d5469aed47d19e7908d19bd194493aThomas Graf
11044d362409d5469aed47d19e7908d19bd194493aThomas Graf        //ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
11144d362409d5469aed47d19e7908d19bd194493aThomas Graf        const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
11244d362409d5469aed47d19e7908d19bd194493aThomas Graf        const uchar4 *pt_00 = (const uchar4 *)&bp2[0];
11344d362409d5469aed47d19e7908d19bd194493aThomas Graf        const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y];
11444d362409d5469aed47d19e7908d19bd194493aThomas Graf        const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z];
11544d362409d5469aed47d19e7908d19bd194493aThomas Graf        const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z];
11644d362409d5469aed47d19e7908d19bd194493aThomas Graf
11744d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v000 = convert_uint4(pt_00[0]);
11844d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v100 = convert_uint4(pt_00[1]);
11944d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v010 = convert_uint4(pt_10[0]);
12044d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v110 = convert_uint4(pt_10[1]);
12144d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v001 = convert_uint4(pt_01[0]);
12244d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v101 = convert_uint4(pt_01[1]);
12344d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v011 = convert_uint4(pt_11[0]);
12444d362409d5469aed47d19e7908d19bd194493aThomas Graf        uint4 v111 = convert_uint4(pt_11[1]);
125
126        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
127        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
128        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
129        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
130
131        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
132        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
133
134        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
135        uint4 v2 = (v + 0x7f) >> (int4)8;
136
137        uchar4 ret = convert_uchar4(v2);
138        ret.w = in->w;
139
140        #if 0
141        if (!x1) {
142            ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
143            ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w);
144            ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
145            ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
146            ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
147
148            ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
149            ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
150            ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
151            ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
152
153            ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
154            ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
155        }
156        #endif
157        *out = ret;
158
159
160        in++;
161        out++;
162        x1++;
163    }
164}
165
166RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
167                                                     const Script *s, const Element *e)
168            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
169
170    mRootPtr = &kernel;
171}
172
173RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() {
174}
175
176void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) {
177    s->mHal.info.exportedVariableCount = 1;
178}
179
180void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() {
181    mLUT.clear();
182}
183
184
185RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
186                                    const Script *s, const Element *e) {
187
188    return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
189}
190
191
192