rsCpuIntrinsicConvolve5x5.cpp revision ce0351debba8dadd1a7af2b3e926de6d787b49af
/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
17d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
18709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h"
19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h"
20d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
21d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android;
22d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android::renderscript;
23d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
24709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android {
25709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript {
26709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
27709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
28709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic:
30709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void populateScript(Script *);
31709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void invokeFreeChildren();
32709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
33709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
36709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual ~RsdCpuScriptIntrinsicConvolve5x5();
37c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
39709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected:
40d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    float fp[28];
41d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    short ip[28];
42d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    ObjectBaseRef<Allocation> alloc;
43709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
44709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
45709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    static void kernel(const RsForEachStubParamStruct *p,
46709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                       uint32_t xstart, uint32_t xend,
47709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                       uint32_t instep, uint32_t outstep);
48709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
49709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
50d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams};
51d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
52709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
53d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
54d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
55709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
56709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    rsAssert(slot == 1);
57709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    alloc.set(static_cast<Allocation *>(data));
58709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
59d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
60709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
61709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                                    const void *data, size_t dataLength) {
62d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    rsAssert(slot == 0);
63709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    memcpy (&fp, data, dataLength);
64d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    for(int ct=0; ct < 25; ct++) {
65ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams        ip[ct] = (short)(fp[ct] * 255.f);
66d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
67d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
68d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
69d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
70d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsstatic void One(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
71d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
72d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                const float* coeff) {
73d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
74d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
75d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
76d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x2 = x;
77d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
78d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
79d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
80d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    float4 px = convert_float4(py0[x0]) * coeff[0] +
81d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x1]) * coeff[1] +
82d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x2]) * coeff[2] +
83d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x3]) * coeff[3] +
84d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x4]) * coeff[4] +
85d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
86d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x0]) * coeff[5] +
87d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x1]) * coeff[6] +
88d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x2]) * coeff[7] +
89d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x3]) * coeff[8] +
90d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x4]) * coeff[9] +
91d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
92d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x0]) * coeff[10] +
93d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x1]) * coeff[11] +
94d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x2]) * coeff[12] +
95d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x3]) * coeff[13] +
96d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x4]) * coeff[14] +
97d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
98d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x0]) * coeff[15] +
99d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x1]) * coeff[16] +
100d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x2]) * coeff[17] +
101d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x3]) * coeff[18] +
102d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x4]) * coeff[19] +
103d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
104d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x0]) * coeff[20] +
105d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x1]) * coeff[21] +
106d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x2]) * coeff[22] +
107d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x3]) * coeff[23] +
108d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x4]) * coeff[24];
109d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
110d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    px = clamp(px, 0.f, 255.f);
111d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
112ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    //if ((out[0].r != o.r) || (out[0].y != o.y) || (out[0].z != o.z) || (out[0].w != o.w)) {
113ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams        //ALOGE("x %i  %i,%i,%i,%i  %i,%i,%i,%i", x, o.x, o.y, o.z, o.w, out[0].x, out[0].y, out[0].z, out[0].w);
114ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    //}
115ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    //o.w = 0xff;
116ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    out->rgba = o.rgba;
117d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
118d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
/**
 * Externally implemented (C linkage) vectorised 5x5 convolution.
 *
 * @param dst    first output pixel to write.
 * @param y0     pointer into the first of the five source rows (y0..y4).
 * @param coef   filter weights in the 16-bit integer form.
 * @param count  amount of work; the caller in this file advances its output
 *               by two pixels per unit of count.
 *
 * NOTE(review): the exact read window of the implementation is not visible
 * from this file - confirm against the assembly source.
 */
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count);
122a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
123709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernel(const RsForEachStubParamStruct *p,
124709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                              uint32_t xstart, uint32_t xend,
125709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                              uint32_t instep, uint32_t outstep) {
126709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
127b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams    if (!cp->alloc.get()) {
128b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
129b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        return;
130b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams    }
131709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
132709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
133d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
134d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t y0 = rsMax((int32_t)p->y-2, 0);
135d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t y1 = rsMax((int32_t)p->y-1, 0);
136d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t y2 = p->y;
137d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
138d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
139d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
140709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
141709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
142709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
143709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
144709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
145d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
146d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uchar4 *out = (uchar4 *)p->out;
147d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x1 = xstart;
148d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x2 = xend;
149d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
150a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    while((x1 < x2) && (x1 < 2)) {
151a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
152a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        out++;
153a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        x1++;
154a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    }
155a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
156a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams#if defined(ARCH_ARM_HAVE_NEON)
157a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    if((x1 + 3) < x2) {
158a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
159a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
160a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        out += len << 1;
161a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        x1 += len << 1;
162a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    }
163a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams#endif
164a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
165d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    while(x1 < x2) {
166d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams        One(p, x1, out, py0, py1, py2, py3, py4, cp->fp);
167d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams        out++;
168d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams        x1++;
169d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
170d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
171d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
172d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
173709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
174c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
175c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
176d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
177709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    mRootPtr = &kernel;
178ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    for(int ct=0; ct < 25; ct++) {
179709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        fp[ct] = 1.f / 25.f;
180ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams        ip[ct] = (short)(fp[ct] * 255.f);
181d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
182d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
183d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
184709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
185709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
186709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
187709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
188709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    s->mHal.info.exportedVariableCount = 2;
189709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
190709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
191709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
192709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    alloc.clear();
193709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
194709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
195709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
196c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
197c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams                                            const Script *s, const Element *e) {
198709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
199c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
200709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
201709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
202709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
203d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
204