17c4b888f2147edf99690b6af75470774ff31c43bJason Sams/*
27c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Copyright (C) 2012 The Android Open Source Project
37c4b888f2147edf99690b6af75470774ff31c43bJason Sams *
47c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Licensed under the Apache License, Version 2.0 (the "License");
57c4b888f2147edf99690b6af75470774ff31c43bJason Sams * you may not use this file except in compliance with the License.
67c4b888f2147edf99690b6af75470774ff31c43bJason Sams * You may obtain a copy of the License at
77c4b888f2147edf99690b6af75470774ff31c43bJason Sams *
87c4b888f2147edf99690b6af75470774ff31c43bJason Sams *      http://www.apache.org/licenses/LICENSE-2.0
97c4b888f2147edf99690b6af75470774ff31c43bJason Sams *
107c4b888f2147edf99690b6af75470774ff31c43bJason Sams * Unless required by applicable law or agreed to in writing, software
117c4b888f2147edf99690b6af75470774ff31c43bJason Sams * distributed under the License is distributed on an "AS IS" BASIS,
127c4b888f2147edf99690b6af75470774ff31c43bJason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
137c4b888f2147edf99690b6af75470774ff31c43bJason Sams * See the License for the specific language governing permissions and
147c4b888f2147edf99690b6af75470774ff31c43bJason Sams * limitations under the License.
157c4b888f2147edf99690b6af75470774ff31c43bJason Sams */
167c4b888f2147edf99690b6af75470774ff31c43bJason Sams
177c4b888f2147edf99690b6af75470774ff31c43bJason Sams
187c4b888f2147edf99690b6af75470774ff31c43bJason Sams#include "rsCpuIntrinsic.h"
197c4b888f2147edf99690b6af75470774ff31c43bJason Sams#include "rsCpuIntrinsicInlines.h"
207c4b888f2147edf99690b6af75470774ff31c43bJason Sams
217c4b888f2147edf99690b6af75470774ff31c43bJason Samsusing namespace android;
227c4b888f2147edf99690b6af75470774ff31c43bJason Samsusing namespace android::renderscript;
237c4b888f2147edf99690b6af75470774ff31c43bJason Sams
247c4b888f2147edf99690b6af75470774ff31c43bJason Samsnamespace android {
257c4b888f2147edf99690b6af75470774ff31c43bJason Samsnamespace renderscript {
267c4b888f2147edf99690b6af75470774ff31c43bJason Sams
277c4b888f2147edf99690b6af75470774ff31c43bJason Sams
287c4b888f2147edf99690b6af75470774ff31c43bJason Samsclass RsdCpuScriptIntrinsic3DLUT : public RsdCpuScriptIntrinsic {
297c4b888f2147edf99690b6af75470774ff31c43bJason Samspublic:
307c4b888f2147edf99690b6af75470774ff31c43bJason Sams    virtual void populateScript(Script *);
317c4b888f2147edf99690b6af75470774ff31c43bJason Sams    virtual void invokeFreeChildren();
327c4b888f2147edf99690b6af75470774ff31c43bJason Sams
337c4b888f2147edf99690b6af75470774ff31c43bJason Sams    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
347c4b888f2147edf99690b6af75470774ff31c43bJason Sams
357c4b888f2147edf99690b6af75470774ff31c43bJason Sams    virtual ~RsdCpuScriptIntrinsic3DLUT();
367c4b888f2147edf99690b6af75470774ff31c43bJason Sams    RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
377c4b888f2147edf99690b6af75470774ff31c43bJason Sams
387c4b888f2147edf99690b6af75470774ff31c43bJason Samsprotected:
397c4b888f2147edf99690b6af75470774ff31c43bJason Sams    ObjectBaseRef<Allocation> mLUT;
407c4b888f2147edf99690b6af75470774ff31c43bJason Sams
417c4b888f2147edf99690b6af75470774ff31c43bJason Sams    static void kernel(const RsForEachStubParamStruct *p,
427c4b888f2147edf99690b6af75470774ff31c43bJason Sams                       uint32_t xstart, uint32_t xend,
437c4b888f2147edf99690b6af75470774ff31c43bJason Sams                       uint32_t instep, uint32_t outstep);
447c4b888f2147edf99690b6af75470774ff31c43bJason Sams};
457c4b888f2147edf99690b6af75470774ff31c43bJason Sams
467c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
477c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
487c4b888f2147edf99690b6af75470774ff31c43bJason Sams
497c4b888f2147edf99690b6af75470774ff31c43bJason Sams
507c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::setGlobalObj(uint32_t slot, ObjectBase *data) {
517c4b888f2147edf99690b6af75470774ff31c43bJason Sams    rsAssert(slot == 0);
527c4b888f2147edf99690b6af75470774ff31c43bJason Sams    mLUT.set(static_cast<Allocation *>(data));
537c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
547c4b888f2147edf99690b6af75470774ff31c43bJason Sams
5507e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosieextern "C" void rsdIntrinsic3DLUT_K(void *dst, void const *in, size_t count,
565dcaaa5f50926bebf6877e254c521faa7e2593e3Simon Hosie                                      void const *lut,
575dcaaa5f50926bebf6877e254c521faa7e2593e3Simon Hosie                                      int32_t pitchy, int32_t pitchz,
585dcaaa5f50926bebf6877e254c521faa7e2593e3Simon Hosie                                      int dimx, int dimy, int dimz);
597c4b888f2147edf99690b6af75470774ff31c43bJason Sams
607c4b888f2147edf99690b6af75470774ff31c43bJason Sams
617c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::kernel(const RsForEachStubParamStruct *p,
627c4b888f2147edf99690b6af75470774ff31c43bJason Sams                                      uint32_t xstart, uint32_t xend,
637c4b888f2147edf99690b6af75470774ff31c43bJason Sams                                      uint32_t instep, uint32_t outstep) {
647c4b888f2147edf99690b6af75470774ff31c43bJason Sams    RsdCpuScriptIntrinsic3DLUT *cp = (RsdCpuScriptIntrinsic3DLUT *)p->usr;
657c4b888f2147edf99690b6af75470774ff31c43bJason Sams
66d25fef7232a939faaffcdb83a1be28285313c38eJason Sams    uchar4 *out = (uchar4 *)p->out;
67d25fef7232a939faaffcdb83a1be28285313c38eJason Sams    uchar4 *in = (uchar4 *)p->in;
687c4b888f2147edf99690b6af75470774ff31c43bJason Sams    uint32_t x1 = xstart;
697c4b888f2147edf99690b6af75470774ff31c43bJason Sams    uint32_t x2 = xend;
707c4b888f2147edf99690b6af75470774ff31c43bJason Sams
717c4b888f2147edf99690b6af75470774ff31c43bJason Sams    const uchar *bp = (const uchar *)cp->mLUT->mHal.drvState.lod[0].mallocPtr;
727c4b888f2147edf99690b6af75470774ff31c43bJason Sams
737c4b888f2147edf99690b6af75470774ff31c43bJason Sams    int4 dims = {
748994abbe699bb05fa70cff101becc925db6b2c26synergy dev        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimX - 1),
758994abbe699bb05fa70cff101becc925db6b2c26synergy dev        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimY - 1),
768994abbe699bb05fa70cff101becc925db6b2c26synergy dev        static_cast<int>(cp->mLUT->mHal.drvState.lod[0].dimZ - 1),
77d533c4c66510f2b83b9397607756479d79baae10Stephen Hines        -1
787c4b888f2147edf99690b6af75470774ff31c43bJason Sams    };
79d533c4c66510f2b83b9397607756479d79baae10Stephen Hines    const float4 m = (float4)(1.f / 255.f) * convert_float4(dims);
807c4b888f2147edf99690b6af75470774ff31c43bJason Sams    const int4 coordMul = convert_int4(m * (float4)0x8000);
817c4b888f2147edf99690b6af75470774ff31c43bJason Sams    const size_t stride_y = cp->mLUT->mHal.drvState.lod[0].stride;
827c4b888f2147edf99690b6af75470774ff31c43bJason Sams    const size_t stride_z = stride_y * cp->mLUT->mHal.drvState.lod[0].dimY;
837c4b888f2147edf99690b6af75470774ff31c43bJason Sams
847c4b888f2147edf99690b6af75470774ff31c43bJason Sams    //ALOGE("strides %zu %zu", stride_y, stride_z);
857c4b888f2147edf99690b6af75470774ff31c43bJason Sams
86074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS)
8707e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie    if (gArchUseSIMD) {
8807e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie        int32_t len = x2 - x1;
8907e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie        if(len > 0) {
9007e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie            rsdIntrinsic3DLUT_K(out, in, len,
9107e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie                                bp, stride_y, stride_z,
9207e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie                                dims.x, dims.y, dims.z);
9307e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie            x1 += len;
9407e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie            out += len;
9507e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie            in += len;
967c4b888f2147edf99690b6af75470774ff31c43bJason Sams        }
9707e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie    }
987c4b888f2147edf99690b6af75470774ff31c43bJason Sams#endif
997c4b888f2147edf99690b6af75470774ff31c43bJason Sams
10007e4665c04a71462e6cfc1c2bb2300a9ed111e60Simon Hosie    while (x1 < x2) {
1017c4b888f2147edf99690b6af75470774ff31c43bJason Sams        int4 baseCoord = convert_int4(*in) * coordMul;
1027c4b888f2147edf99690b6af75470774ff31c43bJason Sams        int4 coord1 = baseCoord >> (int4)15;
1037c4b888f2147edf99690b6af75470774ff31c43bJason Sams        //int4 coord2 = min(coord1 + 1, gDims - 1);
1047c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1057c4b888f2147edf99690b6af75470774ff31c43bJason Sams        int4 weight2 = baseCoord & 0x7fff;
1067c4b888f2147edf99690b6af75470774ff31c43bJason Sams        int4 weight1 = (int4)0x8000 - weight2;
1077c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1087c4b888f2147edf99690b6af75470774ff31c43bJason Sams        //ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
1097c4b888f2147edf99690b6af75470774ff31c43bJason Sams        const uchar *bp2 = bp + (coord1.x * 4) + (coord1.y * stride_y) + (coord1.z * stride_z);
1107c4b888f2147edf99690b6af75470774ff31c43bJason Sams        const uchar4 *pt_00 = (const uchar4 *)&bp2[0];
1117c4b888f2147edf99690b6af75470774ff31c43bJason Sams        const uchar4 *pt_10 = (const uchar4 *)&bp2[stride_y];
1127c4b888f2147edf99690b6af75470774ff31c43bJason Sams        const uchar4 *pt_01 = (const uchar4 *)&bp2[stride_z];
1137c4b888f2147edf99690b6af75470774ff31c43bJason Sams        const uchar4 *pt_11 = (const uchar4 *)&bp2[stride_y + stride_z];
1147c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1157c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v000 = convert_uint4(pt_00[0]);
1167c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v100 = convert_uint4(pt_00[1]);
1177c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v010 = convert_uint4(pt_10[0]);
1187c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v110 = convert_uint4(pt_10[1]);
1197c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v001 = convert_uint4(pt_01[0]);
1207c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v101 = convert_uint4(pt_01[1]);
1217c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v011 = convert_uint4(pt_11[0]);
1227c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v111 = convert_uint4(pt_11[1]);
1237c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1247c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 yz00 = ((v000 * weight1.x) + (v100 * weight2.x)) >> (int4)7;
1257c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 yz10 = ((v010 * weight1.x) + (v110 * weight2.x)) >> (int4)7;
1267c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 yz01 = ((v001 * weight1.x) + (v101 * weight2.x)) >> (int4)7;
1277c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 yz11 = ((v011 * weight1.x) + (v111 * weight2.x)) >> (int4)7;
1287c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1295e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines        uint4 z0 = ((yz00 * weight1.y) + (yz10 * weight2.y)) >> (int4)15;
1305e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines        uint4 z1 = ((yz01 * weight1.y) + (yz11 * weight2.y)) >> (int4)15;
1317c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1325e3fb0b3cfadcb44a74cf4b6ec9ec65c11ba811eStephen Hines        uint4 v = ((z0 * weight1.z) + (z1 * weight2.z)) >> (int4)15;
1337c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uint4 v2 = (v + 0x7f) >> (int4)8;
1347c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1357c4b888f2147edf99690b6af75470774ff31c43bJason Sams        uchar4 ret = convert_uchar4(v2);
1360b575de8ed0b628d84d256f5846500b0385979bdTim Murray        ret.w = in->w;
1377c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1387c4b888f2147edf99690b6af75470774ff31c43bJason Sams        #if 0
1397c4b888f2147edf99690b6af75470774ff31c43bJason Sams        if (!x1) {
1407c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("in          %08x %08x %08x %08x", in->r, in->g, in->b, in->a);
1417c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("baseCoord   %08x %08x %08x %08x", baseCoord.x, baseCoord.y, baseCoord.z, baseCoord.w);
1427c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("coord1      %08x %08x %08x %08x", coord1.x, coord1.y, coord1.z, coord1.w);
1437c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("weight1     %08x %08x %08x %08x", weight1.x, weight1.y, weight1.z, weight1.w);
1447c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("weight2     %08x %08x %08x %08x", weight2.x, weight2.y, weight2.z, weight2.w);
1457c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1467c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("v000        %08x %08x %08x %08x", v000.x, v000.y, v000.z, v000.w);
1477c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("v100        %08x %08x %08x %08x", v100.x, v100.y, v100.z, v100.w);
1487c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("yz00        %08x %08x %08x %08x", yz00.x, yz00.y, yz00.z, yz00.w);
1497c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("z0          %08x %08x %08x %08x", z0.x, z0.y, z0.z, z0.w);
1507c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1517c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("v           %08x %08x %08x %08x", v.x, v.y, v.z, v.w);
1527c4b888f2147edf99690b6af75470774ff31c43bJason Sams            ALOGE("v2          %08x %08x %08x %08x", v2.x, v2.y, v2.z, v2.w);
1537c4b888f2147edf99690b6af75470774ff31c43bJason Sams        }
1547c4b888f2147edf99690b6af75470774ff31c43bJason Sams        #endif
1557c4b888f2147edf99690b6af75470774ff31c43bJason Sams        *out = ret;
1567c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1577c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1587c4b888f2147edf99690b6af75470774ff31c43bJason Sams        in++;
1597c4b888f2147edf99690b6af75470774ff31c43bJason Sams        out++;
1607c4b888f2147edf99690b6af75470774ff31c43bJason Sams        x1++;
1617c4b888f2147edf99690b6af75470774ff31c43bJason Sams    }
1627c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1637c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1647c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptIntrinsic3DLUT::RsdCpuScriptIntrinsic3DLUT(RsdCpuReferenceImpl *ctx,
1657c4b888f2147edf99690b6af75470774ff31c43bJason Sams                                                     const Script *s, const Element *e)
1667c4b888f2147edf99690b6af75470774ff31c43bJason Sams            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_3DLUT) {
1677c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1687c4b888f2147edf99690b6af75470774ff31c43bJason Sams    mRootPtr = &kernel;
1697c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1707c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1717c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptIntrinsic3DLUT::~RsdCpuScriptIntrinsic3DLUT() {
1727c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1737c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1747c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::populateScript(Script *s) {
1757c4b888f2147edf99690b6af75470774ff31c43bJason Sams    s->mHal.info.exportedVariableCount = 1;
1767c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1777c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1787c4b888f2147edf99690b6af75470774ff31c43bJason Samsvoid RsdCpuScriptIntrinsic3DLUT::invokeFreeChildren() {
1797c4b888f2147edf99690b6af75470774ff31c43bJason Sams    mLUT.clear();
1807c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1817c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1827c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1837c4b888f2147edf99690b6af75470774ff31c43bJason SamsRsdCpuScriptImpl * rsdIntrinsic_3DLUT(RsdCpuReferenceImpl *ctx,
1847c4b888f2147edf99690b6af75470774ff31c43bJason Sams                                    const Script *s, const Element *e) {
1857c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1867c4b888f2147edf99690b6af75470774ff31c43bJason Sams    return new RsdCpuScriptIntrinsic3DLUT(ctx, s, e);
1877c4b888f2147edf99690b6af75470774ff31c43bJason Sams}
1887c4b888f2147edf99690b6af75470774ff31c43bJason Sams
1897c4b888f2147edf99690b6af75470774ff31c43bJason Sams
190