/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
17e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
18709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h"
19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h"
20e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
21e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Samsusing namespace android;
22e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Samsusing namespace android::renderscript;
23e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
24709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android {
25709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript {
26709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
27709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
28709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic {
29709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic:
30709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void populateScript(Script *);
31709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void invokeFreeChildren();
32709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
33709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
36709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    virtual ~RsdCpuScriptIntrinsicConvolve3x3();
37c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *);
38709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
39709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected:
40c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    float mFp[16];
41c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    short mIp[16];
42c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    ObjectBaseRef<const Allocation> mAlloc;
43c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    ObjectBaseRef<const Element> mElement;
44709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelU1(const RsForEachStubParamStruct *p,
463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelU2(const RsForEachStubParamStruct *p,
493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelU4(const RsForEachStubParamStruct *p,
523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelF1(const RsForEachStubParamStruct *p,
553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelF2(const RsForEachStubParamStruct *p,
583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    static void kernelF4(const RsForEachStubParamStruct *p,
613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t xstart, uint32_t xend,
623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                         uint32_t instep, uint32_t outstep);
63e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams};
64e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
65709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
66e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}
67e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
68e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
69709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) {
70709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    rsAssert(slot == 1);
71c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    mAlloc.set(static_cast<Allocation *>(data));
72709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
73709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
74709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data,
75709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                                    size_t dataLength) {
76e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    rsAssert(slot == 0);
77c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    memcpy (&mFp, data, dataLength);
78e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    for(int ct=0; ct < 9; ct++) {
793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if (mFp[ct] >= 0) {
803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        } else {
823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
84e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    }
85e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}
86e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
87709a0978ae141198018ca9769f8d96292a8928e6Jason Samsextern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1,
88709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                          const void *y2, const short *coef, uint32_t count);
89e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
90e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const uchar4 *py0, const uchar4 *py1, const uchar4 *py2,
933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
94e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
95d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
964cca49b13db92b13ca07c1d330ad450d1b10f507Tim Murray    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
97e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
98e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    float4 px = convert_float4(py0[x1]) * coeff[0] +
99e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py0[x]) * coeff[1] +
100e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py0[x2]) * coeff[2] +
101e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py1[x1]) * coeff[3] +
102e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py1[x]) * coeff[4] +
103e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py1[x2]) * coeff[5] +
104e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py2[x1]) * coeff[6] +
105e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py2[x]) * coeff[7] +
106e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams                convert_float4(py2[x2]) * coeff[8];
107e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
1084283f579c424f07bc07c7f075398053eed3f8281Miao Wang    px = clamp(px + 0.5f, 0.f, 255.f);
109e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
110e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    *out = o;
111e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}
112e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
1133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
1143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const uchar2 *py0, const uchar2 *py1, const uchar2 *py2,
1153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
1163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
1183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
1193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    float2 px = convert_float2(py0[x1]) * coeff[0] +
1213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py0[x]) * coeff[1] +
1223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py0[x2]) * coeff[2] +
1233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py1[x1]) * coeff[3] +
1243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py1[x]) * coeff[4] +
1253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py1[x2]) * coeff[5] +
1263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py2[x1]) * coeff[6] +
1273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py2[x]) * coeff[7] +
1283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                convert_float2(py2[x2]) * coeff[8];
1293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1304283f579c424f07bc07c7f075398053eed3f8281Miao Wang    px = clamp(px + 0.5f, 0.f, 255.f);
1313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    *out = convert_uchar2(px);
1323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
1333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
1353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const uchar *py0, const uchar *py1, const uchar *py2,
1363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
1373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
1393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
1403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    float px = ((float)py0[x1]) * coeff[0] +
1423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py0[x]) * coeff[1] +
1433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py0[x2]) * coeff[2] +
1443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py1[x1]) * coeff[3] +
1453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py1[x]) * coeff[4] +
1463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py1[x2]) * coeff[5] +
1473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py2[x1]) * coeff[6] +
1483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py2[x]) * coeff[7] +
1493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams               ((float)py2[x2]) * coeff[8];
1504283f579c424f07bc07c7f075398053eed3f8281Miao Wang    *out = clamp(px + 0.5f, 0.f, 255.f);
1513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
1523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
1543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float4 *py0, const float4 *py1, const float4 *py2,
1553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
1563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
1583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
1593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
1603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
1613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
1623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
1633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
1653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float2 *py0, const float2 *py1, const float2 *py2,
1663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
1673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
1693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
1703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
1713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
1723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
1733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
1743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
1763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float *py0, const float *py1, const float *py2,
1773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                          const float* coeff) {
1783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
1803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1);
1813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) +
1823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) +
1833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams           (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]);
1843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
1853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
1863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p,
1873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
1883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
189709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
190b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams
191c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    if (!cp->mAlloc.get()) {
192b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        ALOGE("Convolve3x3 executed without input, skipping");
193b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        return;
194b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams    }
195c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
196c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
197e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
198e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
199e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
200709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py0 = (const uchar4 *)(pin + stride * y2);
201709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y);
202709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py2 = (const uchar4 *)(pin + stride * y1);
203e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
204e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uchar4 *out = (uchar4 *)p->out;
205e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uint32_t x1 = xstart;
206e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    uint32_t x2 = xend;
207e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    if(x1 == 0) {
2083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp);
209e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams        x1 ++;
210e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams        out++;
211e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    }
212e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
213e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    if(x2 > x1) {
214074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
215f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams        if (gArchUseSIMD) {
216f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams            int32_t len = (x2 - x1 - 1) >> 1;
217f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams            if(len > 0) {
218f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams                rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
219f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams                x1 += len << 1;
220f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams                out += len << 1;
221f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams            }
222e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams        }
223e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#endif
224e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
225e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams        while(x1 != x2) {
2263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp);
2273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out++;
2283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1++;
2293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
2303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
2313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
2323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p,
2343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
2353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
2363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
2373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (!cp->mAlloc.get()) {
2393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ALOGE("Convolve3x3 executed without input, skipping");
2403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        return;
2413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
2423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
2433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
2443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
2463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
2473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar2 *py0 = (const uchar2 *)(pin + stride * y2);
2483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y);
2493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar2 *py2 = (const uchar2 *)(pin + stride * y1);
2503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uchar2 *out = (uchar2 *)p->out;
2523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = xstart;
2533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = xend;
2543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x1 == 0) {
2553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp);
2563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        x1 ++;
2573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        out++;
2583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
2593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x2 > x1) {
2613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
2623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        int32_t len = (x2 - x1 - 1) >> 1;
2633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if(len > 0) {
2643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
2653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1 += len << 1;
2663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out += len << 1;
2673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
2683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif
2693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        while(x1 != x2) {
2713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp);
2723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out++;
2733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1++;
2743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
2753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
2763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
2773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p,
2793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
2803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
2813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
2823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (!cp->mAlloc.get()) {
2843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ALOGE("Convolve3x3 executed without input, skipping");
2853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        return;
2863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
2873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
2883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
2893b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
2913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
2923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *py0 = (const uchar *)(pin + stride * y2);
2933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *py1 = (const uchar *)(pin + stride * p->y);
2943b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *py2 = (const uchar *)(pin + stride * y1);
2953b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
2963b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uchar *out = (uchar *)p->out;
2973b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = xstart;
2983b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = xend;
2993b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x1 == 0) {
3003b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp);
3013b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        x1 ++;
3023b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        out++;
3033b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3043b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3053b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x2 > x1) {
3063b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
3073b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        int32_t len = (x2 - x1 - 1) >> 1;
3083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if(len > 0) {
3093b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
3103b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1 += len << 1;
3113b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out += len << 1;
3123b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
3133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif
3143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        while(x1 != x2) {
3163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp);
3173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out++;
3183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1++;
3193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
3203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
3223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p,
3243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
3253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
3263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
3273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (!cp->mAlloc.get()) {
3293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ALOGE("Convolve3x3 executed without input, skipping");
3303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        return;
3313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
3333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
3343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
3363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
3373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float4 *py0 = (const float4 *)(pin + stride * y2);
3383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float4 *py1 = (const float4 *)(pin + stride * p->y);
3393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float4 *py2 = (const float4 *)(pin + stride * y1);
3403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    float4 *out = (float4 *)p->out;
3423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = xstart;
3433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = xend;
3443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x1 == 0) {
3453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp);
3463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        x1 ++;
3473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        out++;
3483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x2 > x1) {
3513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
3523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        int32_t len = (x2 - x1 - 1) >> 1;
3533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if(len > 0) {
3543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
3553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1 += len << 1;
3563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out += len << 1;
3573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
3583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif
3593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        while(x1 != x2) {
3613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp);
3623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out++;
3633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1++;
3643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
3653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
3673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p,
3693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
3703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
3713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
3723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (!cp->mAlloc.get()) {
3743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ALOGE("Convolve3x3 executed without input, skipping");
3753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        return;
3763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
3783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
3793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
3813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
3823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float2 *py0 = (const float2 *)(pin + stride * y2);
3833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float2 *py1 = (const float2 *)(pin + stride * p->y);
3843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float2 *py2 = (const float2 *)(pin + stride * y1);
3853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    float2 *out = (float2 *)p->out;
3873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = xstart;
3883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = xend;
3893b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x1 == 0) {
3903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp);
3913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        x1 ++;
3923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        out++;
3933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
3943b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
3953b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x2 > x1) {
3963b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
3973b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        int32_t len = (x2 - x1 - 1) >> 1;
3983b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if(len > 0) {
3993b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
4003b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1 += len << 1;
4013b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out += len << 1;
4023b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
4033b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif
4043b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4053b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        while(x1 != x2) {
4063b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp);
4073b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out++;
4083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1++;
4093b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
4103b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
4113b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams}
4123b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p,
4133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t xstart, uint32_t xend,
4143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams                                                uint32_t instep, uint32_t outstep) {
4153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr;
4163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (!cp->mAlloc.get()) {
4183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ALOGE("Convolve3x3 executed without input, skipping");
4193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        return;
4203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
4213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
4223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
4233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1));
4253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t y2 = rsMax((int32_t)p->y - 1, 0);
4263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float *py0 = (const float *)(pin + stride * y2);
4273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float *py1 = (const float *)(pin + stride * p->y);
4283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    const float *py2 = (const float *)(pin + stride * y1);
4293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    float *out = (float *)p->out;
4313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x1 = xstart;
4323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    uint32_t x2 = xend;
4333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x1 == 0) {
4343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp);
4353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        x1 ++;
4363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        out++;
4373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
4383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if(x2 > x1) {
4403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
4413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        int32_t len = (x2 - x1 - 1) >> 1;
4423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        if(len > 0) {
4433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len);
4443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            x1 += len << 1;
4453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            out += len << 1;
4463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
4473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif
4483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams
4493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        while(x1 != x2) {
4503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp);
451e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams            out++;
452e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams            x1++;
453e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams        }
454e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    }
455e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}
456e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
457709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3(
458c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
459c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) {
460e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
4613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    if (e->getType() == RS_TYPE_FLOAT_32) {
4623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        switch(e->getVectorSize()) {
4633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 1:
4643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelF1;
4653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 2:
4673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelF2;
4683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 3:
4703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 4:
4713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelF4;
4723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
4743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    } else {
4753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        switch(e->getVectorSize()) {
4763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 1:
4773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelU1;
4783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 2:
4803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelU2;
4813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 3:
4833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        case 4:
4843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            mRootPtr = &kernelU4;
4853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams            break;
4863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        }
4873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams    }
488e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    for(int ct=0; ct < 9; ct++) {
489c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams        mFp[ct] = 1.f / 9.f;
4903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams        mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
491e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams    }
492709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
493709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
494709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() {
495709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
496709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
497709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) {
498709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    s->mHal.info.exportedVariableCount = 2;
499709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
500709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
501709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() {
502c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    mAlloc.clear();
503709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
504709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
505709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
506c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
507709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
508c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e);
509e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}
510e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
511e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams
512