1/*M///////////////////////////////////////////////////////////////////////////////////////
2//
3//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4//
5//  By downloading, copying, installing or using the software you agree to this license.
6//  If you do not agree to this license, do not download, install,
7//  copy or use the software.
8//
9//
10//                           License Agreement
11//                For Open Source Computer Vision Library
12//
13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
15// Third party copyrights are property of their respective owners.
16//
17// @Authors
18//    Dachuan Zhao, dachuan@multicorewareinc.com
19//
20// Redistribution and use in source and binary forms, with or without modification,
21// are permitted provided that the following conditions are met:
22//
23//   * Redistribution's of source code must retain the above copyright notice,
24//     this list of conditions and the following disclaimer.
25//
26//   * Redistribution's in binary form must reproduce the above copyright notice,
27//     this list of conditions and the following disclaimer in the documentation
28//     and/or other materials provided with the distribution.
29//
30//   * The name of the copyright holders may not be used to endorse or promote products
31//     derived from this software without specific prior written permission.
32//
33// This software is provided by the copyright holders and contributors as is and
34// any express or implied warranties, including, but not limited to, the implied
35// warranties of merchantability and fitness for a particular purpose are disclaimed.
36// In no event shall the Intel Corporation or contributors be liable for any direct,
37// indirect, incidental, special, exemplary, or consequential damages
38// (including, but not limited to, procurement of substitute goods or services;
39// loss of use, data, or profits; or business interruption) however caused
40// and on any theory of liability, whether in contract, strict liability,
41// or tort (including negligence or otherwise) arising in any way out of
42// the use of this software, even if advised of the possibility of such damage.
43//
44//M*/
45
46#ifdef DOUBLE_SUPPORT
47#ifdef cl_amd_fp64
48#pragma OPENCL EXTENSION cl_amd_fp64:enable
49#elif defined (cl_khr_fp64)
50#pragma OPENCL EXTENSION cl_khr_fp64:enable
51#endif
52#endif
53
54#if defined BORDER_REPLICATE
55// aaaaaa|abcdefgh|hhhhhhh
56#define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)
57#elif defined BORDER_WRAP
58// cdefgh|abcdefgh|abcdefg
59#define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)
60#elif defined BORDER_REFLECT
61// fedcba|abcdefgh|hgfedcb
62#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)
63#elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
64// gfedcb|abcdefgh|gfedcba
65#define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)
66#else
67#error No extrapolation method
68#endif
69
70#if cn != 3
71#define loadpix(addr)  *(__global const T*)(addr)
72#define storepix(val, addr)  *(__global T*)(addr) = (val)
73#define PIXSIZE ((int)sizeof(T))
74#else
75#define loadpix(addr)  vload3(0, (__global const T1*)(addr))
76#define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))
77#define PIXSIZE ((int)sizeof(T1)*3)
78#endif
79
80#define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
81
82#if kercn == 4
83#define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
84#endif
85
86#ifdef INTEL_DEVICE
87#define MAD(x,y,z) fma((x),(y),(z))
88#else
89#define MAD(x,y,z) mad((x),(y),(z))
90#endif
91
92#define LOAD_LOCAL(col_gl, col_lcl) \
93    sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
94    sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
95    temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
96    sum0 = MAD(co1, temp, sum0);                                            \
97    sum1 = co3 * temp;                                                      \
98    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
99    sum0 = MAD(co2, temp, sum0);                                            \
100    sum1 = MAD(co2, temp, sum1);                                            \
101    temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
102    sum0 = MAD(co3, temp, sum0);                                            \
103    sum1 = MAD(co1, temp, sum1);                                            \
104    smem[0][col_lcl] = sum0;                                                \
105    sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
106    sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
107    smem[1][col_lcl] = sum1;
108
109
110#if kercn == 4
111#define LOAD_LOCAL4(col_gl, col_lcl) \
112    sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
113    sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
114    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
115    sum40 = MAD(co1, temp4, sum40);                                             \
116    sum41 = co3 * temp4;                                                        \
117    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
118    sum40 = MAD(co2, temp4, sum40);                                             \
119    sum41 = MAD(co2, temp4, sum41);                                             \
120    temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
121    sum40 = MAD(co3, temp4, sum40);                                             \
122    sum41 = MAD(co1, temp4, sum41);                                             \
123    vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
124    sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
125    sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
126    vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
127#endif
128
129#define noconvert
130
131__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
132                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
133{
134    const int x = get_global_id(0)*kercn;
135    const int y = 2*get_global_id(1);
136
137    __local FT smem[2][LOCAL_SIZE + 4];
138    __global uchar * dstData = dst + dst_offset;
139    __global const uchar * srcData = src + src_offset;
140
141    FT sum0, sum1, temp;
142    FT co1 = 0.375f;
143    FT co2 = 0.25f;
144    FT co3 = 0.0625f;
145
146    const int src_y = 2*y;
147    int col;
148
149    if (src_y >= 2 && src_y < src_rows - 4)
150    {
151#define EXTRAPOLATE_(val, maxVal)   val
152#if kercn == 1
153        col = EXTRAPOLATE(x, src_cols);
154        LOAD_LOCAL(col, 2 + get_local_id(0))
155#else
156        if (x < src_cols-4)
157        {
158            float4 sum40, sum41, temp4;
159            LOAD_LOCAL4(x, get_local_id(0))
160        }
161        else
162        {
163            for (int i=0; i<4; i++)
164            {
165                col = EXTRAPOLATE(x+i, src_cols);
166                LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
167            }
168        }
169#endif
170        if (get_local_id(0) < 2)
171        {
172            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
173            LOAD_LOCAL(col, get_local_id(0))
174        }
175        else if (get_local_id(0) < 4)
176        {
177            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
178            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
179        }
180    }
181    else // need extrapolate y
182    {
183#define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
184#if kercn == 1
185        col = EXTRAPOLATE(x, src_cols);
186        LOAD_LOCAL(col, 2 + get_local_id(0))
187#else
188        if (x < src_cols-4)
189        {
190            float4 sum40, sum41, temp4;
191            LOAD_LOCAL4(x, get_local_id(0))
192        }
193        else
194        {
195            for (int i=0; i<4; i++)
196            {
197                col = EXTRAPOLATE(x+i, src_cols);
198                LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
199            }
200        }
201#endif
202        if (get_local_id(0) < 2)
203        {
204            col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
205            LOAD_LOCAL(col, get_local_id(0))
206        }
207        else if (get_local_id(0) < 4)
208        {
209            col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
210            LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
211        }
212    }
213
214    barrier(CLK_LOCAL_MEM_FENCE);
215
216#if kercn == 1
217    if (get_local_id(0) < LOCAL_SIZE / 2)
218    {
219        const int tid2 = get_local_id(0) * 2;
220
221        const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
222
223        if (dst_x < dst_cols)
224        {
225            for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
226            {
227#if cn == 1
228#if fdepth <= 5
229                FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
230#else
231                FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
232#endif
233#else
234                FT sum = co3 * smem[yin - y][2 + tid2 - 2];
235                sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
236                sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
237                sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
238#endif
239                sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
240                storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
241            }
242        }
243    }
244#else
245    int tid4 = get_local_id(0) * 4;
246    int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
247    if (dst_x < dst_cols - 1)
248    {
249        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
250        {
251
252            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
253            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
254            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
255            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
256            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
257            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
258
259            dst_x ++;
260            sum =     co3* smem[yin - y][2 + tid4 + 4];
261            sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
262            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
263            sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
264            sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
265            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
266            dst_x --;
267        }
268
269    }
270    else if (dst_x < dst_cols)
271    {
272        for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
273        {
274            FT sum =  co3* smem[yin - y][2 + tid4 + 2];
275            sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
276            sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
277            sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
278            sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
279
280            storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
281        }
282    }
283#endif
284
285}
286