rsCpuIntrinsic3DLUT.cpp revision 5edd18e4307e3c223b5db8a6cc5ca309a3a69c2a
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18#include "rsCpuIntrinsic.h"
19#include "rsCpuIntrinsicInlines.h"
20
21using namespace android;
22using namespace android::renderscript;
23
24namespace android {
25namespace renderscript {
26
27
28class RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic {
29public:
30    virtual void populateScript(Script *);
31    virtual void invokeFreeChildren();
32
33    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
34
35    virtual ~RsdCpuScriptIntrinsic3DLUT();
36    RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
37
38protected:
39    ObjectBaseRef<Allocation> mLUT;
40
41    static void kernel(const RsForEachStubParamStruct *p,
42                       uint32_t xstart, uint32_t xend,
43                       uint32_t instep, uint32_t outstep);
44};
45
46}
47}
48
49
50void RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
51    rsAssert(slot == 0);
52    mLUT.set(static_cast<Allocation *>(data));
53}
54
// Accelerated 3D LUT routine with C linkage; its definition is not part of
// this file. The kernel below is its only caller here and passes, in order:
// the output pixels, the input pixels, the LUT base pointer, the LUT's byte
// strides for y and z, a count, and a small table of 16-bit constants.
// NOTE(review): the caller advances its pointers by (count << 1) pixels after
// the call, so count presumably is a number of pixel pairs - confirm against
// the routine's implementation.
extern "C" void rsdIntrinsic3DLUT_K(void *dst,
                                    const void *src,
                                    const void *lut,
                                    size_t lut_stride_y,
                                    size_t lut_stride_z,
                                    uint32_t count,
                                    const void *constants);
58
59
60void RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
61                                      uint32_t xstart, uint32_t xend,
62                                      uint32_t instep, uint32_t outstep) {
63// FIXME(srhines)!!!! Temporary WAR for non-neon arm crash in clang.
64#if defined(ARCH_ARM_HAVE_NEON)
65    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
66
67    uchar4 *out = (uchar4 *)p->out;
68    uchar4 *in = (uchar4 *)p->in;
69    uint32_t x1 = xstart;
70    uint32_t x2 = xend;
71
72    const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr;
73
74    int4 dims = {
75        cp->mLUT->mHal.drvState.lod[0].dimX,
76        cp->mLUT->mHal.drvState.lod[0].dimY,
77        cp->mLUT->mHal.drvState.lod[0].dimZ,
78        0
79    };
80    const float4 m = (float4)(1.f / 255.f) * convert_float4(dims - 1);
81    const int4 coordMul = convert_int4(m * (float4)0x8000);
82    const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride;
83    const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY;
84
85    //ALOGE("strides %zu %zu", stride_y, stride_z);
86
87    while (x1 < x2) {
88#if defined(ARCH_ARM_HAVE_NEON)
89        int32_t len = (x2 - x1 - 1) >> 1;
90        if(len > 0) {
91            const short neon_constants[] = {
92                coordMul.x, coordMul.y, coordMul.z, 0,
93                0, 0, 0, 0xffff,
94
95            };
96
97            rsdIntrinsic3DLUT_K(out, in, bp, stride_y, stride_z, len, neon_constants);
98            x1 += len << 1;
99            out += len << 1;
100            in += len << 1;
101        }
102
103#endif
104
105        int4 baseCoord = convert_int4(*in) * coordMul;
106        int4 coord1 = baseCoord >> (int4)15;
107        //int4 coord2 = min(coord1 + 1, gDims - 1);
108
109        int4 weight2 = baseCoord & 0x7fff;
110        int4 weight1 = (int4)0x8000 - weight2;
111
112        //ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
113        const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
114        const uchar4 *pt_00 = (const uchar4 *)&bp2[0];
115        const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y];
116        const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z];
117        const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z];
118
119        uint4 v000 = convert_uint4(pt_00[0]);
120        uint4 v100 = convert_uint4(pt_00[1]);
121        uint4 v010 = convert_uint4(pt_10[0]);
122        uint4 v110 = convert_uint4(pt_10[1]);
123        uint4 v001 = convert_uint4(pt_01[0]);
124        uint4 v101 = convert_uint4(pt_01[1]);
125        uint4 v011 = convert_uint4(pt_11[0]);
126        uint4 v111 = convert_uint4(pt_11[1]);
127
128        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
129        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
130        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
131        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
132
133        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
134        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
135
136        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
137        uint4 v2 = (v + 0x7f) >> (int4)8;
138
139        uchar4 ret = convert_uchar4(v2);
140        ret.a = in->a;
141
142        #if 0
143        if (!x1) {
144            ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
145            ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w);
146            ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
147            ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
148            ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
149
150            ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
151            ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
152            ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
153            ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
154
155            ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
156            ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
157        }
158        #endif
159        *out = ret;
160
161
162        in++;
163        out++;
164        x1++;
165    }
166#endif
167}
168
169RsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
170                                                     const Script *s, const Element *e)
171            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
172
173    mRootPtr = &kernel;
174}
175
176RsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() {
177}
178
179void RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) {
180    s->mHal.info.exportedVariableCount = 1;
181}
182
183void RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() {
184    mLUT.clear();
185}
186
187
188RsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
189                                    const Script *s, const Element *e) {
190
191    return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
192}
193
194
195