/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
17d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
18709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h"
19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h"
20d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
21d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android;
22d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android::renderscript;
23d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
24709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android {
25709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript {
26709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
27709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
28709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic:
30c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void populateScript(Script *) override;
31c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void invokeFreeChildren() override;
32709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
33c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
34c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void setGlobalObj(uint32_t slot, ObjectBase *data) override;
35709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
36c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    ~RsdCpuScriptIntrinsicConvolve5x5() override;
37c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
39709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected:
4034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float mFp[28];
4134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    short mIp[28];
42d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    ObjectBaseRef<Allocation> alloc;
43709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
44709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
45b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelU1(const RsExpandKernelDriverInfo *info,
4634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
479ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
48b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelU2(const RsExpandKernelDriverInfo *info,
4934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
509ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
51b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelU4(const RsExpandKernelDriverInfo *info,
5234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
539ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
54b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelF1(const RsExpandKernelDriverInfo *info,
5534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
569ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
57b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelF2(const RsExpandKernelDriverInfo *info,
5834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
599ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
60b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernelF4(const RsExpandKernelDriverInfo *info,
6134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                         uint32_t xstart, uint32_t xend,
629ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                         uint32_t outstep);
63709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
64709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
65d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams};
66d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
67709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
68d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
69d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
70709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    rsAssert(slot == 1);
72709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    alloc.set(static_cast<Allocation *>(data));
73709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
74d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
75709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76709a0978ae141198018ca9769f8d96292a8928e6Jason Sams                                                    const void *data, size_t dataLength) {
77d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    rsAssert(slot == 0);
7834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    memcpy (&mFp, data, dataLength);
79d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    for(int ct=0; ct < 25; ct++) {
8034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        if (mFp[ct] >= 0) {
8134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
8234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        } else {
8334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
8434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        }
85d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
86d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
87d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
88d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
89b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneU4(const RsExpandKernelDriverInfo *info, uint32_t x, uchar4 *out,
9034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
9134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
92d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
93d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
94d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
95d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x2 = x;
96b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
97b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
98d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
99d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    float4 px = convert_float4(py0[x0]) * coeff[0] +
100d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x1]) * coeff[1] +
101d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x2]) * coeff[2] +
102d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x3]) * coeff[3] +
103d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py0[x4]) * coeff[4] +
104d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
105d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x0]) * coeff[5] +
106d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x1]) * coeff[6] +
107d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x2]) * coeff[7] +
108d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x3]) * coeff[8] +
109d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py1[x4]) * coeff[9] +
110d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
111d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x0]) * coeff[10] +
112d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x1]) * coeff[11] +
113d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x2]) * coeff[12] +
114d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x3]) * coeff[13] +
115d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py2[x4]) * coeff[14] +
116d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
117d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x0]) * coeff[15] +
118d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x1]) * coeff[16] +
119d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x2]) * coeff[17] +
120d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x3]) * coeff[18] +
121d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py3[x4]) * coeff[19] +
122d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
123d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x0]) * coeff[20] +
124d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x1]) * coeff[21] +
125d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x2]) * coeff[22] +
126d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x3]) * coeff[23] +
127d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams                convert_float4(py4[x4]) * coeff[24];
1284283f579c424f07bc07c7f075398053eed3f8281Miao Wang    px = clamp(px + 0.5f, 0.f, 255.f);
12934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = convert_uchar4(px);
13034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
131d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
132b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneU2(const RsExpandKernelDriverInfo *info, uint32_t x, uchar2 *out,
13334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
13434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
13534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
13634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
13734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
13834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = x;
139b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
140b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
14134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
14234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float2 px = convert_float2(py0[x0]) * coeff[0] +
14334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py0[x1]) * coeff[1] +
14434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py0[x2]) * coeff[2] +
14534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py0[x3]) * coeff[3] +
14634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py0[x4]) * coeff[4] +
14734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
14834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py1[x0]) * coeff[5] +
14934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py1[x1]) * coeff[6] +
15034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py1[x2]) * coeff[7] +
15134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py1[x3]) * coeff[8] +
15234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py1[x4]) * coeff[9] +
15334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
15434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py2[x0]) * coeff[10] +
15534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py2[x1]) * coeff[11] +
15634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py2[x2]) * coeff[12] +
15734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py2[x3]) * coeff[13] +
15834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py2[x4]) * coeff[14] +
15934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
16034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py3[x0]) * coeff[15] +
16134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py3[x1]) * coeff[16] +
16234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py3[x2]) * coeff[17] +
16334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py3[x3]) * coeff[18] +
16434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py3[x4]) * coeff[19] +
16534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
16634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py4[x0]) * coeff[20] +
16734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py4[x1]) * coeff[21] +
16834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py4[x2]) * coeff[22] +
16934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py4[x3]) * coeff[23] +
17034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                convert_float2(py4[x4]) * coeff[24];
1714283f579c424f07bc07c7f075398053eed3f8281Miao Wang    px = clamp(px + 0.5f, 0.f, 255.f);
17234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = convert_uchar2(px);
17334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
17434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
175b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneU1(const RsExpandKernelDriverInfo *info, uint32_t x, uchar *out,
17634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
17734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
17834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
17934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
18034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
18134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = x;
182b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
183b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
18434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
18534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float px = (float)(py0[x0]) * coeff[0] +
18634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py0[x1]) * coeff[1] +
18734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py0[x2]) * coeff[2] +
18834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py0[x3]) * coeff[3] +
18934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py0[x4]) * coeff[4] +
19034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
19134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py1[x0]) * coeff[5] +
19234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py1[x1]) * coeff[6] +
19334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py1[x2]) * coeff[7] +
19434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py1[x3]) * coeff[8] +
19534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py1[x4]) * coeff[9] +
19634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
19734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py2[x0]) * coeff[10] +
19834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py2[x1]) * coeff[11] +
19934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py2[x2]) * coeff[12] +
20034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py2[x3]) * coeff[13] +
20134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py2[x4]) * coeff[14] +
20234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
20334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py3[x0]) * coeff[15] +
20434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py3[x1]) * coeff[16] +
20534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py3[x2]) * coeff[17] +
20634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py3[x3]) * coeff[18] +
20734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py3[x4]) * coeff[19] +
20834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
20934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py4[x0]) * coeff[20] +
21034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py4[x1]) * coeff[21] +
21134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py4[x2]) * coeff[22] +
21234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py4[x3]) * coeff[23] +
21334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               (float)(py4[x4]) * coeff[24];
2144283f579c424f07bc07c7f075398053eed3f8281Miao Wang    px = clamp(px + 0.5f, 0.f, 255.f);
21534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = px;
21634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
21734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
218b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneF4(const RsExpandKernelDriverInfo *info, uint32_t x, float4 *out,
21934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
22034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
22134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
22234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
22334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
22434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = x;
225b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
226b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
22734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
22834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float4 px = py0[x0] * coeff[0] +
22934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x1] * coeff[1] +
23034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x2] * coeff[2] +
23134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x3] * coeff[3] +
23234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x4] * coeff[4] +
23334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
23434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x0] * coeff[5] +
23534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x1] * coeff[6] +
23634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x2] * coeff[7] +
23734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x3] * coeff[8] +
23834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x4] * coeff[9] +
23934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
24034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x0] * coeff[10] +
24134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x1] * coeff[11] +
24234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x2] * coeff[12] +
24334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x3] * coeff[13] +
24434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x4] * coeff[14] +
24534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
24634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x0] * coeff[15] +
24734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x1] * coeff[16] +
24834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x2] * coeff[17] +
24934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x3] * coeff[18] +
25034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x4] * coeff[19] +
25134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
25234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x0] * coeff[20] +
25334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x1] * coeff[21] +
25434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x2] * coeff[22] +
25534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x3] * coeff[23] +
25634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x4] * coeff[24];
25734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = px;
25834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
25934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
260b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneF2(const RsExpandKernelDriverInfo *info, uint32_t x, float2 *out,
26134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
26234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
26334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
26434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
26534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
26634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = x;
267b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
268b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
26934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
27034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float2 px = py0[x0] * coeff[0] +
27134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x1] * coeff[1] +
27234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x2] * coeff[2] +
27334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x3] * coeff[3] +
27434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py0[x4] * coeff[4] +
27534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
27634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x0] * coeff[5] +
27734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x1] * coeff[6] +
27834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x2] * coeff[7] +
27934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x3] * coeff[8] +
28034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py1[x4] * coeff[9] +
28134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
28234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x0] * coeff[10] +
28334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x1] * coeff[11] +
28434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x2] * coeff[12] +
28534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x3] * coeff[13] +
28634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py2[x4] * coeff[14] +
28734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
28834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x0] * coeff[15] +
28934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x1] * coeff[16] +
29034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x2] * coeff[17] +
29134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x3] * coeff[18] +
29234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py3[x4] * coeff[19] +
29334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
29434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x0] * coeff[20] +
29534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x1] * coeff[21] +
29634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x2] * coeff[22] +
29734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x3] * coeff[23] +
29834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                py4[x4] * coeff[24];
29934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = px;
300d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
301d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
302b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void OneF1(const RsExpandKernelDriverInfo *info, uint32_t x, float *out,
30334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
30434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                  const float* coeff) {
30534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
30634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x0 = rsMax((int32_t)x-2, 0);
30734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = rsMax((int32_t)x-1, 0);
30834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = x;
309b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(info->dim.x-1));
310b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(info->dim.x-1));
31134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
31234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    float px = py0[x0] * coeff[0] +
31334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py0[x1] * coeff[1] +
31434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py0[x2] * coeff[2] +
31534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py0[x3] * coeff[3] +
31634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py0[x4] * coeff[4] +
31734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
31834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py1[x0] * coeff[5] +
31934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py1[x1] * coeff[6] +
32034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py1[x2] * coeff[7] +
32134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py1[x3] * coeff[8] +
32234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py1[x4] * coeff[9] +
32334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
32434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py2[x0] * coeff[10] +
32534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py2[x1] * coeff[11] +
32634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py2[x2] * coeff[12] +
32734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py2[x3] * coeff[13] +
32834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py2[x4] * coeff[14] +
32934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
33034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py3[x0] * coeff[15] +
33134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py3[x1] * coeff[16] +
33234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py3[x2] * coeff[17] +
33334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py3[x3] * coeff[18] +
33434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py3[x4] * coeff[19] +
33534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
33634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py4[x0] * coeff[20] +
33734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py4[x1] * coeff[21] +
33834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py4[x2] * coeff[22] +
33934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py4[x3] * coeff[23] +
34034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams               py4[x4] * coeff[24];
34134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    *out = px;
34234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
34334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
34434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
// Externally defined routine with C linkage; only the prototype lives here.
// It takes a destination pointer, five source pointers, a table of short
// coefficients and a count.
// NOTE(review): presumably the architecture-specific fast path fed with the
// fixed-point coefficient table - confirm against the defining source.
extern "C" void rsdIntrinsicConvolve5x5_K(
        void *dst,
        const void *y0, const void *y1, const void *y2, const void *y3, const void *y4,
        const short *coef, uint32_t count);
348a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
349b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsExpandKernelDriverInfo *info,
35034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
3519ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
352b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
353b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams    if (!cp->alloc.get()) {
354b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
355b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams        return;
356b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams    }
357709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
360b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
361b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
362b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
363b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
364b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
365d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
366709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
372b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uchar4 *out = (uchar4 *)info->outPtr[0];
373d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x1 = xstart;
374d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    uint32_t x2 = xend;
375d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
376a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    while((x1 < x2) && (x1 < 2)) {
377b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
378a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        out++;
379a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        x1++;
380a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    }
3817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3)
3827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    // for x86 SIMD, require minimum of 7 elements (4 for SIMD,
3837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    // 3 for end boundary where x may hit the end boundary)
3847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    if (gArchUseSIMD &&((x1 + 6) < x2)) {
3857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        // subtract 3 for end boundary
3867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        uint32_t len = (x2 - x1 - 3) >> 2;
38745d29c41b1b9805991dcd8557f6d1b70977f5428Yong Chen        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
3887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        out += len << 2;
3897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x1 += len << 2;
3907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
3917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
392a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
393074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS)
394f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams    if(gArchUseSIMD && ((x1 + 3) < x2)) {
395a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
396de52a834dbcb2a3196948e7b9f67d395493ea9a4Jason Sams        rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len);
39734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out += len << 1;
39834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1 += len << 1;
39934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
40034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif
40134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
40234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while(x1 < x2) {
403b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
40434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
40534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
40634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
40734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
40834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
409b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsExpandKernelDriverInfo *info,
41034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
4119ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
412b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
41334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (!cp->alloc.get()) {
41434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
41534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        return;
41634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
41734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
41834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
41934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
420b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
421b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
422b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
423b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
424b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
42534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
42634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
42734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
42834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
42934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
43034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
43134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
432b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uchar2 *out = (uchar2 *)info->outPtr[0];
43334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = xstart;
43434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = xend;
43534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
43634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while((x1 < x2) && (x1 < 2)) {
437b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
43834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
43934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
44034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
44134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
44234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
44334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if((x1 + 3) < x2) {
44434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
445a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
446a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        out += len << 1;
447a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams        x1 += len << 1;
448a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams    }
449a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams#endif
450a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams
451d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    while(x1 < x2) {
452b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
453d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams        out++;
454d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams        x1++;
455d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
456d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
457d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
458b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsExpandKernelDriverInfo *info,
45934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
4609ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
461b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
46234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (!cp->alloc.get()) {
46334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
46434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        return;
46534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
46634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
46734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
46834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
469b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
470b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
471b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
472b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
473b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
47434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
47534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *py0 = (const uchar *)(pin + stride * y0);
47634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *py1 = (const uchar *)(pin + stride * y1);
47734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *py2 = (const uchar *)(pin + stride * y2);
47834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *py3 = (const uchar *)(pin + stride * y3);
47934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *py4 = (const uchar *)(pin + stride * y4);
48034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
481b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uchar *out = (uchar *)info->outPtr[0];
48234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = xstart;
48334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = xend;
48434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
48534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while((x1 < x2) && (x1 < 2)) {
486b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
48734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
48834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
48934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
49034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
49134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
49234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if((x1 + 3) < x2) {
49334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
49434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
49534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out += len << 1;
49634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1 += len << 1;
49734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
49834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif
49934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
50034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while(x1 < x2) {
501b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneU1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
50234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
50334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
50434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
50534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
50634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
507b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsExpandKernelDriverInfo *info,
50834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
5099ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
510b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
51134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (!cp->alloc.get()) {
51234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
51334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        return;
51434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
51534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
51634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
51734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
518b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
519b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
520b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
521b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
522b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
52334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
52434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float4 *py0 = (const float4 *)(pin + stride * y0);
52534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float4 *py1 = (const float4 *)(pin + stride * y1);
52634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float4 *py2 = (const float4 *)(pin + stride * y2);
52734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float4 *py3 = (const float4 *)(pin + stride * y3);
52834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float4 *py4 = (const float4 *)(pin + stride * y4);
52934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
530b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    float4 *out = (float4 *)info->outPtr[0];
53134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = xstart;
53234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = xend;
53334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
53434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while((x1 < x2) && (x1 < 2)) {
535b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
53634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
53734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
53834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
53934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
54034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
54134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if((x1 + 3) < x2) {
54234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
54334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
54434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out += len << 1;
54534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1 += len << 1;
54634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
54734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif
54834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
54934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while(x1 < x2) {
550b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF4(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
55134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
55234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
55334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
55434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
55534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
556b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsExpandKernelDriverInfo *info,
55734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
5589ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
559b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
56034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (!cp->alloc.get()) {
56134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
56234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        return;
56334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
56434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
56534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
56634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
567b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
568b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
569b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
570b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
571b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
57234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
57334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float2 *py0 = (const float2 *)(pin + stride * y0);
57434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float2 *py1 = (const float2 *)(pin + stride * y1);
57534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float2 *py2 = (const float2 *)(pin + stride * y2);
57634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float2 *py3 = (const float2 *)(pin + stride * y3);
57734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float2 *py4 = (const float2 *)(pin + stride * y4);
57834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
579b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    float2 *out = (float2 *)info->outPtr[0];
58034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = xstart;
58134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = xend;
58234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
58334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while((x1 < x2) && (x1 < 2)) {
584b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
58534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
58634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
58734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
58834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
58934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
59034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if((x1 + 3) < x2) {
59134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
59234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
59334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out += len << 1;
59434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1 += len << 1;
59534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
59634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif
59734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
59834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while(x1 < x2) {
599b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF2(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
60034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
60134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
60234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
60334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
60434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
605b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsExpandKernelDriverInfo *info,
60634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams                                                uint32_t xstart, uint32_t xend,
6079ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                                uint32_t outstep) {
608b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)info->usr;
60934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (!cp->alloc.get()) {
61034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        ALOGE("Convolve5x5 executed without input, skipping");
61134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        return;
61234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
61334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
61434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
61534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
616b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y0 = rsMax((int32_t)info->current.y-2, 0);
617b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y1 = rsMax((int32_t)info->current.y-1, 0);
618b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y2 = info->current.y;
619b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y3 = rsMin((int32_t)info->current.y+1, (int32_t)(info->dim.y-1));
620b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t y4 = rsMin((int32_t)info->current.y+2, (int32_t)(info->dim.y-1));
62134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
62234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float *py0 = (const float *)(pin + stride * y0);
62334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float *py1 = (const float *)(pin + stride * y1);
62434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float *py2 = (const float *)(pin + stride * y2);
62534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float *py3 = (const float *)(pin + stride * y3);
62634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    const float *py4 = (const float *)(pin + stride * y4);
62734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
628b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    float *out = (float *)info->outPtr[0];
62934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x1 = xstart;
63034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    uint32_t x2 = xend;
63134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
63234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while((x1 < x2) && (x1 < 2)) {
633b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
63434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
63534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
63634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
63734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
63834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON)
63934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if((x1 + 3) < x2) {
64034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        uint32_t len = (x2 - x1 - 3) >> 1;
64134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
64234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out += len << 1;
64334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1 += len << 1;
64434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
64534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif
64634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams
64734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    while(x1 < x2) {
648b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross        OneF1(info, x1, out, py0, py1, py2, py3, py4, cp->mFp);
64934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        out++;
65034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        x1++;
65134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
65234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams}
653d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
654709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
655c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
656c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
657d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
65834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    if (e->getType() == RS_TYPE_FLOAT_32) {
65934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        switch(e->getVectorSize()) {
66034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 1:
66134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelF1;
66234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
66334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 2:
66434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelF2;
66534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
66634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 3:
66734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 4:
66834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelF4;
66934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
67034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        }
67134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    } else {
67234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        switch(e->getVectorSize()) {
67334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 1:
67434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelU1;
67534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
67634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 2:
67734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelU2;
67834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
67934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 3:
68034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        case 4:
68134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            mRootPtr = &kernelU4;
68234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams            break;
68334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        }
68434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams    }
685ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams    for(int ct=0; ct < 25; ct++) {
68634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        mFp[ct] = 1.f / 25.f;
68734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams        mIp[ct] = (short)(mFp[ct] * 256.f);
688d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams    }
689d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}
690d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams
691709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
692709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
693709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
694709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
695709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    s->mHal.info.exportedVariableCount = 2;
696709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
697709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
698709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
699709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    alloc.clear();
700709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
701709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
702709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
703c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
704c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams                                            const Script *s, const Element *e) {
705709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
706c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
707709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
708