1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuIntrinsic.h"
18#include "rsCpuIntrinsicInlines.h"
19
20using namespace android;
21using namespace android::renderscript;
22
23namespace android {
24namespace renderscript {
25
26
27class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
28public:
29    virtual void populateScript(Script *);
30    virtual void invokeFreeChildren();
31
32    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
33    virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
34
35    virtual ~RsdCpuScriptIntrinsicHistogram();
36    RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
37
38protected:
39    void preLaunch(uint32_t slot, const Allocation * ain,
40                   Allocation * aout, const void * usr,
41                   uint32_t usrLen, const RsScriptCall *sc);
42    void postLaunch(uint32_t slot, const Allocation * ain,
43                    Allocation * aout, const void * usr,
44                    uint32_t usrLen, const RsScriptCall *sc);
45
46
47    float mDot[4];
48    int mDotI[4];
49    int *mSums;
50    ObjectBaseRef<Allocation> mAllocOut;
51
52    static void kernelP1U4(const RsForEachStubParamStruct *p,
53                          uint32_t xstart, uint32_t xend,
54                          uint32_t instep, uint32_t outstep);
55    static void kernelP1U3(const RsForEachStubParamStruct *p,
56                          uint32_t xstart, uint32_t xend,
57                          uint32_t instep, uint32_t outstep);
58    static void kernelP1U2(const RsForEachStubParamStruct *p,
59                          uint32_t xstart, uint32_t xend,
60                          uint32_t instep, uint32_t outstep);
61    static void kernelP1U1(const RsForEachStubParamStruct *p,
62                          uint32_t xstart, uint32_t xend,
63                          uint32_t instep, uint32_t outstep);
64
65    static void kernelP1L4(const RsForEachStubParamStruct *p,
66                           uint32_t xstart, uint32_t xend,
67                           uint32_t instep, uint32_t outstep);
68    static void kernelP1L3(const RsForEachStubParamStruct *p,
69                           uint32_t xstart, uint32_t xend,
70                           uint32_t instep, uint32_t outstep);
71    static void kernelP1L2(const RsForEachStubParamStruct *p,
72                           uint32_t xstart, uint32_t xend,
73                           uint32_t instep, uint32_t outstep);
74    static void kernelP1L1(const RsForEachStubParamStruct *p,
75                           uint32_t xstart, uint32_t xend,
76                           uint32_t instep, uint32_t outstep);
77
78};
79
80}
81}
82
83void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
84    rsAssert(slot == 1);
85    mAllocOut.set(static_cast<Allocation *>(data));
86}
87
88void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
89    rsAssert(slot == 0);
90    rsAssert(dataLength == 16);
91    memcpy(mDot, data, 16);
92    mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
93    mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
94    mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
95    mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
96}
97
98
99
100void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot, const Allocation * ain,
101                                      Allocation * aout, const void * usr,
102                                      uint32_t usrLen, const RsScriptCall *sc) {
103
104    const uint32_t threads = mCtx->getThreadCount();
105    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
106
107    switch (slot) {
108    case 0:
109        switch(vSize) {
110        case 1:
111            mRootPtr = &kernelP1U1;
112            break;
113        case 2:
114            mRootPtr = &kernelP1U2;
115            break;
116        case 3:
117            mRootPtr = &kernelP1U3;
118            vSize = 4;
119            break;
120        case 4:
121            mRootPtr = &kernelP1U4;
122            break;
123        }
124        break;
125    case 1:
126        switch(ain->getType()->getElement()->getVectorSize()) {
127        case 1:
128            mRootPtr = &kernelP1L1;
129            break;
130        case 2:
131            mRootPtr = &kernelP1L2;
132            break;
133        case 3:
134            mRootPtr = &kernelP1L3;
135            break;
136        case 4:
137            mRootPtr = &kernelP1L4;
138            break;
139        }
140        break;
141    }
142    memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
143}
144
145void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot, const Allocation * ain,
146                                       Allocation * aout, const void * usr,
147                                       uint32_t usrLen, const RsScriptCall *sc) {
148
149    unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
150    uint32_t threads = mCtx->getThreadCount();
151    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
152
153    if (vSize == 3) vSize = 4;
154
155    for (uint32_t ct=0; ct < (256 * vSize); ct++) {
156        o[ct] = mSums[ct];
157        for (uint32_t t=1; t < threads; t++) {
158            o[ct] += mSums[ct + (256 * vSize * t)];
159        }
160    }
161}
162
163void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsForEachStubParamStruct *p,
164                                                uint32_t xstart, uint32_t xend,
165                                                uint32_t instep, uint32_t outstep) {
166
167    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
168    uchar *in = (uchar *)p->in;
169    int * sums = &cp->mSums[256 * 4 * p->lid];
170
171    for (uint32_t x = xstart; x < xend; x++) {
172        sums[(in[0] << 2)    ] ++;
173        sums[(in[1] << 2) + 1] ++;
174        sums[(in[2] << 2) + 2] ++;
175        sums[(in[3] << 2) + 3] ++;
176        in += instep;
177    }
178}
179
180void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsForEachStubParamStruct *p,
181                                                uint32_t xstart, uint32_t xend,
182                                                uint32_t instep, uint32_t outstep) {
183
184    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
185    uchar *in = (uchar *)p->in;
186    int * sums = &cp->mSums[256 * 4 * p->lid];
187
188    for (uint32_t x = xstart; x < xend; x++) {
189        sums[(in[0] << 2)    ] ++;
190        sums[(in[1] << 2) + 1] ++;
191        sums[(in[2] << 2) + 2] ++;
192        in += instep;
193    }
194}
195
196void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsForEachStubParamStruct *p,
197                                                uint32_t xstart, uint32_t xend,
198                                                uint32_t instep, uint32_t outstep) {
199
200    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
201    uchar *in = (uchar *)p->in;
202    int * sums = &cp->mSums[256 * 2 * p->lid];
203
204    for (uint32_t x = xstart; x < xend; x++) {
205        sums[(in[0] << 1)    ] ++;
206        sums[(in[1] << 1) + 1] ++;
207        in += instep;
208    }
209}
210
211void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsForEachStubParamStruct *p,
212                                                uint32_t xstart, uint32_t xend,
213                                                uint32_t instep, uint32_t outstep) {
214
215    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
216    uchar *in = (uchar *)p->in;
217    int * sums = &cp->mSums[256 * p->lid];
218
219    for (uint32_t x = xstart; x < xend; x++) {
220        int t = (cp->mDotI[0] * in[0]) +
221                (cp->mDotI[1] * in[1]) +
222                (cp->mDotI[2] * in[2]) +
223                (cp->mDotI[3] * in[3]);
224        sums[(t + 0x7f) >> 8] ++;
225        in += instep;
226    }
227}
228
229void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsForEachStubParamStruct *p,
230                                                uint32_t xstart, uint32_t xend,
231                                                uint32_t instep, uint32_t outstep) {
232
233    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
234    uchar *in = (uchar *)p->in;
235    int * sums = &cp->mSums[256 * p->lid];
236
237    for (uint32_t x = xstart; x < xend; x++) {
238        int t = (cp->mDotI[0] * in[0]) +
239                (cp->mDotI[1] * in[1]) +
240                (cp->mDotI[2] * in[2]);
241        sums[(t + 0x7f) >> 8] ++;
242        in += instep;
243    }
244}
245
246void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsForEachStubParamStruct *p,
247                                                uint32_t xstart, uint32_t xend,
248                                                uint32_t instep, uint32_t outstep) {
249
250    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
251    uchar *in = (uchar *)p->in;
252    int * sums = &cp->mSums[256 * p->lid];
253
254    for (uint32_t x = xstart; x < xend; x++) {
255        int t = (cp->mDotI[0] * in[0]) +
256                (cp->mDotI[1] * in[1]);
257        sums[(t + 0x7f) >> 8] ++;
258        in += instep;
259    }
260}
261
262void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsForEachStubParamStruct *p,
263                                                uint32_t xstart, uint32_t xend,
264                                                uint32_t instep, uint32_t outstep) {
265
266    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
267    uchar *in = (uchar *)p->in;
268    int * sums = &cp->mSums[256 * p->lid];
269
270    for (uint32_t x = xstart; x < xend; x++) {
271        int t = (cp->mDotI[0] * in[0]);
272        sums[(t + 0x7f) >> 8] ++;
273        in += instep;
274    }
275}
276
277void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsForEachStubParamStruct *p,
278                                                uint32_t xstart, uint32_t xend,
279                                                uint32_t instep, uint32_t outstep) {
280
281    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)p->usr;
282    uchar *in = (uchar *)p->in;
283    int * sums = &cp->mSums[256 * p->lid];
284
285    for (uint32_t x = xstart; x < xend; x++) {
286        sums[in[0]] ++;
287        in += instep;
288    }
289}
290
291
292RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
293                                                     const Script *s, const Element *e)
294            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
295
296    mRootPtr = NULL;
297    mSums = new int[256 * 4 * mCtx->getThreadCount()];
298    mDot[0] = 0.299f;
299    mDot[1] = 0.587f;
300    mDot[2] = 0.114f;
301    mDot[3] = 0;
302    mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
303    mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
304    mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
305    mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
306}
307
308RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
309    if (mSums) {
310        delete []mSums;
311    }
312}
313
314void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
315    s->mHal.info.exportedVariableCount = 2;
316}
317
318void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
319}
320
321
322RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
323
324    return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
325}
326
327
328