accelerate-kernels-private.h revision 44fb54c2b1dfacf5cb2c9b20b08c164d84f35ecd
1/*
2  Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7
8    http://www.imagemagick.org/script/license.php
9
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15
16  MagickCore private kernels for accelerated functions.
17*/
18
19#ifndef _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21
22#if defined(__cplusplus) || defined(c_plusplus)
23extern "C" {
24#endif
25
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
27
28/*
29  Define declarations.
30*/
31#define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE()		"\n #""else " " \n"
34#define OPENCL_ENDIF()		"\n #""endif " " \n"
35#define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
37
38/*
39  Typedef declarations.
40*/
41
42typedef struct _FloatPixelPacket
43{
44  MagickRealType
45    red,
46    green,
47    blue,
48    alpha,
49    black;
50} FloatPixelPacket;
51
52const char *accelerateKernels =
53
54/*
55  Define declarations.
56*/
57  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
58  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
59  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
60  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
61  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
62  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
63  OPENCL_DEFINE(SigmaRandom, (attenuate))
64  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
65  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
66  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
67
68/*
69  Typedef declarations.
70*/
71  STRINGIFY(
72    typedef enum
73  {
74    UndefinedColorspace,
75    RGBColorspace,            /* Linear RGB colorspace */
76    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
77    TransparentColorspace,
78    OHTAColorspace,
79    LabColorspace,
80    XYZColorspace,
81    YCbCrColorspace,
82    YCCColorspace,
83    YIQColorspace,
84    YPbPrColorspace,
85    YUVColorspace,
86    CMYKColorspace,           /* negared linear RGB with black separated */
87    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
88    HSBColorspace,
89    HSLColorspace,
90    HWBColorspace,
91    Rec601LumaColorspace,
92    Rec601YCbCrColorspace,
93    Rec709LumaColorspace,
94    Rec709YCbCrColorspace,
95    LogColorspace,
96    CMYColorspace,            /* negated linear RGB colorspace */
97    LuvColorspace,
98    HCLColorspace,
99    LCHColorspace,            /* alias for LCHuv */
100    LMSColorspace,
101    LCHabColorspace,          /* Cylindrical (Polar) Lab */
102    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
103    scRGBColorspace,
104    HSIColorspace,
105    HSVColorspace,            /* alias for HSB */
106    HCLpColorspace,
107    YDbDrColorspace
108  } ColorspaceType;
109  )
110
111  STRINGIFY(
112    typedef enum
113    {
114      UndefinedCompositeOp,
115      NoCompositeOp,
116      ModulusAddCompositeOp,
117      AtopCompositeOp,
118      BlendCompositeOp,
119      BumpmapCompositeOp,
120      ChangeMaskCompositeOp,
121      ClearCompositeOp,
122      ColorBurnCompositeOp,
123      ColorDodgeCompositeOp,
124      ColorizeCompositeOp,
125      CopyBlackCompositeOp,
126      CopyBlueCompositeOp,
127      CopyCompositeOp,
128      CopyCyanCompositeOp,
129      CopyGreenCompositeOp,
130      CopyMagentaCompositeOp,
131      CopyOpacityCompositeOp,
132      CopyRedCompositeOp,
133      CopyYellowCompositeOp,
134      DarkenCompositeOp,
135      DstAtopCompositeOp,
136      DstCompositeOp,
137      DstInCompositeOp,
138      DstOutCompositeOp,
139      DstOverCompositeOp,
140      DifferenceCompositeOp,
141      DisplaceCompositeOp,
142      DissolveCompositeOp,
143      ExclusionCompositeOp,
144      HardLightCompositeOp,
145      HueCompositeOp,
146      InCompositeOp,
147      LightenCompositeOp,
148      LinearLightCompositeOp,
149      LuminizeCompositeOp,
150      MinusDstCompositeOp,
151      ModulateCompositeOp,
152      MultiplyCompositeOp,
153      OutCompositeOp,
154      OverCompositeOp,
155      OverlayCompositeOp,
156      PlusCompositeOp,
157      ReplaceCompositeOp,
158      SaturateCompositeOp,
159      ScreenCompositeOp,
160      SoftLightCompositeOp,
161      SrcAtopCompositeOp,
162      SrcCompositeOp,
163      SrcInCompositeOp,
164      SrcOutCompositeOp,
165      SrcOverCompositeOp,
166      ModulusSubtractCompositeOp,
167      ThresholdCompositeOp,
168      XorCompositeOp,
169      /* These are new operators, added after the above was last sorted.
170       * The list should be re-sorted only when a new library version is
171       * created.
172       */
173      DivideDstCompositeOp,
174      DistortCompositeOp,
175      BlurCompositeOp,
176      PegtopLightCompositeOp,
177      VividLightCompositeOp,
178      PinLightCompositeOp,
179      LinearDodgeCompositeOp,
180      LinearBurnCompositeOp,
181      MathematicsCompositeOp,
182      DivideSrcCompositeOp,
183      MinusSrcCompositeOp,
184      DarkenIntensityCompositeOp,
185      LightenIntensityCompositeOp
186    } CompositeOperator;
187  )
188
189  STRINGIFY(
190     typedef enum
191     {
192       UndefinedFunction,
193       ArcsinFunction,
194       ArctanFunction,
195       PolynomialFunction,
196       SinusoidFunction
197     } MagickFunction;
198  )
199
200  STRINGIFY(
201    typedef enum
202    {
203      UndefinedNoise,
204      UniformNoise,
205      GaussianNoise,
206      MultiplicativeGaussianNoise,
207      ImpulseNoise,
208      LaplacianNoise,
209      PoissonNoise,
210      RandomNoise
211    } NoiseType;
212  )
213
214  STRINGIFY(
215    typedef enum
216  {
217    UndefinedPixelIntensityMethod = 0,
218    AveragePixelIntensityMethod,
219    BrightnessPixelIntensityMethod,
220    LightnessPixelIntensityMethod,
221    Rec601LumaPixelIntensityMethod,
222    Rec601LuminancePixelIntensityMethod,
223    Rec709LumaPixelIntensityMethod,
224    Rec709LuminancePixelIntensityMethod,
225    RMSPixelIntensityMethod,
226    MSPixelIntensityMethod
227  } PixelIntensityMethod;
228  )
229
230  STRINGIFY(
231  typedef enum {
232    BoxWeightingFunction = 0,
233    TriangleWeightingFunction,
234    CubicBCWeightingFunction,
235    HannWeightingFunction,
236    HammingWeightingFunction,
237    BlackmanWeightingFunction,
238    GaussianWeightingFunction,
239    QuadraticWeightingFunction,
240    JincWeightingFunction,
241    SincWeightingFunction,
242    SincFastWeightingFunction,
243    KaiserWeightingFunction,
244    WelshWeightingFunction,
245    BohmanWeightingFunction,
246    LagrangeWeightingFunction,
247    CosineWeightingFunction,
248  } ResizeWeightingFunctionType;
249  )
250
251  STRINGIFY(
252     typedef enum
253     {
254       UndefinedChannel = 0x0000,
255       RedChannel = 0x0001,
256       GrayChannel = 0x0001,
257       CyanChannel = 0x0001,
258       GreenChannel = 0x0002,
259       MagentaChannel = 0x0002,
260       BlueChannel = 0x0004,
261       YellowChannel = 0x0004,
262       BlackChannel = 0x0008,
263       AlphaChannel = 0x0010,
264       OpacityChannel = 0x0010,
265       IndexChannel = 0x0020,             /* Color Index Table? */
266       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
267       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
268       MetaChannel = 0x0100,              /* ???? */
269       CompositeChannels = 0x002F,
270       AllChannels = 0x7ffffff,
271       /*
272         Special purpose channel types.
273         FUTURE: are these needed any more - they are more like hacks
274         SyncChannels for example is NOT a real channel but a 'flag'
275         It really says -- "User has not defined channels"
276         Though it does have extra meaning in the "-auto-level" operator
277       */
278       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
279       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
280       GrayChannels = 0x0400,
281       SyncChannels = 0x20000,    /* channels modified as a single unit */
282       DefaultChannels = AllChannels
283     } ChannelType;  /* must correspond to PixelChannel */
284  )
285
286/*
287  Helper functions.
288*/
289
290OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
291
292  STRINGIFY(
293    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294    {
295      return((CLQuantum) value);
296    }
297  )
298
299OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
300
301  STRINGIFY(
302    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
303    {
304      return((CLQuantum) (257.0f*value));
305    }
306  )
307
308OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
309
310  STRINGIFY(
311    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
312    {
313      return((CLQuantum) (16843009.0*value));
314    }
315  )
316
317OPENCL_ENDIF()
318
319  STRINGIFY(
320    inline int ClampToCanvas(const int offset,const int range)
321      {
322        return clamp(offset, (int)0, range-1);
323      }
324  )
325
326  STRINGIFY(
327    inline CLQuantum ClampToQuantum(const float value)
328      {
329        return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
330      }
331  )
332
333  STRINGIFY(
334    inline uint ScaleQuantumToMap(CLQuantum value)
335      {
336        if (value >= (CLQuantum) MaxMap)
337          return ((uint)MaxMap);
338        else
339          return ((uint)value);
340      }
341  )
342
343  STRINGIFY(
344    inline float PerceptibleReciprocal(const float x)
345    {
346      float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347      return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348    }
349  )
350
351  STRINGIFY(
352  inline float RoundToUnity(const float value)
353   {
354     return clamp(value,0.0f,1.0f);
355   }
356  )
357
358  STRINGIFY(
359
360  inline unsigned int getPixelIndex(const unsigned int number_channels,
361    const unsigned int columns, const unsigned int x, const unsigned int y)
362  {
363    return (x * number_channels) + (y * columns * number_channels);
364  }
365
366  inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
367  inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
368  inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
369  inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
370
371  inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
372  inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
373  inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
374  inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
375
376  inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
377  inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
378  inline float getBlueF4(float4 p)                      { return p.x; }
379  inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
380
381  inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
382  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
383  inline float getGreenF4(float4 p)                     { return p.y; }
384  inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
385
386  inline CLQuantum getRed(CLPixelType p)                { return p.z; }
387  inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
388  inline float getRedF4(float4 p)                       { return p.z; }
389  inline void setRedF4(float4* p, float value)          { (*p).z = value; }
390
391  inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
392  inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
393  inline float getAlphaF4(float4 p)                     { return p.w; }
394  inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
395
396  inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
397    const ChannelType channel, float *red, float *green, float *blue, float *alpha)
398  {
399    if ((channel & RedChannel) != 0)
400      *red=getPixelRed(p);
401
402    if (number_channels > 2)
403      {
404        if ((channel & GreenChannel) != 0)
405          *green=getPixelGreen(p);
406
407        if ((channel & BlueChannel) != 0)
408          *blue=getPixelBlue(p);
409      }
410
411    if (((number_channels == 4) || (number_channels == 2)) &&
412        ((channel & AlphaChannel) != 0))
413      *alpha=getPixelAlpha(p,number_channels);
414  }
415
416  inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
417    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
418  {
419    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
420
421    float red = 0.0f;
422    float green = 0.0f;
423    float blue = 0.0f;
424    float alpha = 0.0f;
425
426    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
427    return (float4)(red, green, blue, alpha);
428  }
429
430  inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
431    const ChannelType channel, float red, float green, float blue, float alpha)
432  {
433    if ((channel & RedChannel) != 0)
434      setPixelRed(p,ClampToQuantum(red));
435
436    if (number_channels > 2)
437      {
438        if ((channel & GreenChannel) != 0)
439          setPixelGreen(p,ClampToQuantum(green));
440
441        if ((channel & BlueChannel) != 0)
442          setPixelBlue(p,ClampToQuantum(blue));
443      }
444
445    if (((number_channels == 4) || (number_channels == 2)) &&
446        ((channel & AlphaChannel) != 0))
447      setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
448  }
449
450  inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
451    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
452    float4 pixel)
453  {
454    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
455    WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
456  }
457
458  inline float GetPixelIntensity(const unsigned int colorspace,
459    const unsigned int method,float red,float green,float blue)
460  {
461    float intensity;
462
463    if (colorspace == GRAYColorspace)
464      return red;
465
466    switch (method)
467    {
468      case AveragePixelIntensityMethod:
469        {
470          intensity=(red+green+blue)/3.0;
471          break;
472        }
473      case BrightnessPixelIntensityMethod:
474        {
475          intensity=MagickMax(MagickMax(red,green),blue);
476          break;
477        }
478      case LightnessPixelIntensityMethod:
479        {
480          intensity=(MagickMin(MagickMin(red,green),blue)+
481              MagickMax(MagickMax(red,green),blue))/2.0;
482          break;
483        }
484      case MSPixelIntensityMethod:
485        {
486          intensity=(float) (((float) red*red+green*green+blue*blue)/
487              (3.0*QuantumRange));
488          break;
489        }
490      case Rec601LumaPixelIntensityMethod:
491        {
492          /*
493          if (image->colorspace == RGBColorspace)
494          {
495            red=EncodePixelGamma(red);
496            green=EncodePixelGamma(green);
497            blue=EncodePixelGamma(blue);
498          }
499          */
500          intensity=0.298839*red+0.586811*green+0.114350*blue;
501          break;
502        }
503      case Rec601LuminancePixelIntensityMethod:
504        {
505          /*
506          if (image->colorspace == sRGBColorspace)
507          {
508            red=DecodePixelGamma(red);
509            green=DecodePixelGamma(green);
510            blue=DecodePixelGamma(blue);
511          }
512          */
513          intensity=0.298839*red+0.586811*green+0.114350*blue;
514          break;
515        }
516      case Rec709LumaPixelIntensityMethod:
517      default:
518        {
519          /*
520          if (image->colorspace == RGBColorspace)
521          {
522            red=EncodePixelGamma(red);
523            green=EncodePixelGamma(green);
524            blue=EncodePixelGamma(blue);
525          }
526          */
527          intensity=0.212656*red+0.715158*green+0.072186*blue;
528          break;
529        }
530      case Rec709LuminancePixelIntensityMethod:
531        {
532          /*
533          if (image->colorspace == sRGBColorspace)
534          {
535            red=DecodePixelGamma(red);
536            green=DecodePixelGamma(green);
537            blue=DecodePixelGamma(blue);
538          }
539          */
540          intensity=0.212656*red+0.715158*green+0.072186*blue;
541          break;
542        }
543      case RMSPixelIntensityMethod:
544        {
545          intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
546              sqrt(3.0));
547          break;
548        }
549    }
550
551    return intensity;
552  }
553
554  inline int mirrorBottom(int value)
555  {
556      return (value < 0) ? - (value) : value;
557  }
558
559  inline int mirrorTop(int value, int width)
560  {
561      return (value >= width) ? (2 * width - value - 1) : value;
562  }
563  )
564
565/*
566%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
567%                                                                             %
568%                                                                             %
569%                                                                             %
570%     A d d N o i s e                                                         %
571%                                                                             %
572%                                                                             %
573%                                                                             %
574%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
575*/
576
577  STRINGIFY(
578  /*
579  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
580  This is provided under BSD, full license is with the main package.
581  See http://www.doc.ic.ac.uk/~dt10/research
582  */
583
584  // Pre: a<M, b<M
585  // Post: r=(a+b) mod M
586  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
587  {
588    ulong v=a+b;
589    //if( (v>=M) || (v<a) )
590    if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
591      v=v-M;
592    return v;
593  }
594
595  // Pre: a<M,b<M
596  // Post: r=(a*b) mod M
597  // This could be done more efficently, but it is portable, and should
598  // be easy to understand. It can be replaced with any of the better
599  // modular multiplication algorithms (for example if you know you have
600  // double precision available or something).
601  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
602  {
603    ulong r=0;
604    while(a!=0){
605      if(a&1)
606        r=MWC_AddMod64(r,b,M);
607      b=MWC_AddMod64(b,b,M);
608      a=a>>1;
609    }
610    return r;
611  }
612
613  // Pre: a<M, e>=0
614  // Post: r=(a^b) mod M
615  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
616  // most architectures
617  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
618  {
619    ulong sqr=a, acc=1;
620    while(e!=0){
621      if(e&1)
622        acc=MWC_MulMod64(acc,sqr,M);
623        sqr=MWC_MulMod64(sqr,sqr,M);
624      e=e>>1;
625    }
626    return acc;
627  }
628
629  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
630  {
631    ulong m=MWC_PowMod64(A, distance, M);
632    ulong x=curr.x*(ulong)A+curr.y;
633    x=MWC_MulMod64(x, m, M);
634    return (uint2)((uint)(x/A), (uint)(x%A));
635  }
636
637  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
638  {
639    // This is an arbitrary constant for starting LCG jumping from. I didn't
640    // want to start from 1, as then you end up with the two or three first values
641    // being a bit poor in ones - once you've decided that, one constant is as
642    // good as any another. There is no deep mathematical reason for it, I just
643    // generated a random number.
644    enum{ MWC_BASEID = 4077358422479273989UL };
645
646    ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
647    ulong m=MWC_PowMod64(A, dist, M);
648
649    ulong x=MWC_MulMod64(MWC_BASEID, m, M);
650    return (uint2)((uint)(x/A), (uint)(x%A));
651  }
652
653  //! Represents the state of a particular generator
654  typedef struct{ uint x; uint c; } mwc64x_state_t;
655
656  enum{ MWC64X_A = 4294883355U };
657  enum{ MWC64X_M = 18446383549859758079UL };
658
659  void MWC64X_Step(mwc64x_state_t *s)
660  {
661    uint X=s->x, C=s->c;
662
663    uint Xn=MWC64X_A*X+C;
664    uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
665    uint Cn=mad_hi(MWC64X_A,X,carry);
666
667    s->x=Xn;
668    s->c=Cn;
669  }
670
671  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
672  {
673    uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
674    s->x=tmp.x;
675    s->c=tmp.y;
676  }
677
678  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
679  {
680    uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
681    s->x=tmp.x;
682    s->c=tmp.y;
683  }
684
685  //! Return a 32-bit integer in the range [0..2^32)
686  uint MWC64X_NextUint(mwc64x_state_t *s)
687  {
688    uint res=s->x ^ s->c;
689    MWC64X_Step(s);
690    return res;
691  }
692
693  //
694  // End of MWC64X excerpt
695  //
696
697  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
698  {
699    return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
700  }
701
702  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
703  {
704    float
705      alpha,
706      beta,
707      noise,
708      sigma;
709
710    noise = 0.0f;
711    alpha=mwcReadPseudoRandomValue(r);
712    switch(noise_type)
713    {
714      case UniformNoise:
715      default:
716        {
717          noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
718          break;
719        }
720      case GaussianNoise:
721        {
722          float
723            gamma,
724            tau;
725
726          if (alpha == 0.0f)
727            alpha=1.0f;
728          beta=mwcReadPseudoRandomValue(r);
729          gamma=sqrt(-2.0f*log(alpha));
730          sigma=gamma*cospi((2.0f*beta));
731          tau=gamma*sinpi((2.0f*beta));
732          noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
733          break;
734        }
735      case ImpulseNoise:
736      {
737        if (alpha < (SigmaImpulse/2.0f))
738          noise=0.0f;
739        else
740          if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
741            noise=QuantumRange;
742          else
743            noise=pixel;
744        break;
745      }
746      case LaplacianNoise:
747      {
748        if (alpha <= 0.5f)
749          {
750            if (alpha <= MagickEpsilon)
751              noise=(pixel-QuantumRange);
752            else
753              noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
754                0.5f);
755            break;
756          }
757        beta=1.0f-alpha;
758        if (beta <= (0.5f*MagickEpsilon))
759          noise=(pixel+QuantumRange);
760        else
761          noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
762        break;
763      }
764      case MultiplicativeGaussianNoise:
765      {
766        sigma=1.0f;
767        if (alpha > MagickEpsilon)
768          sigma=sqrt(-2.0f*log(alpha));
769        beta=mwcReadPseudoRandomValue(r);
770        noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
771          cospi((2.0f*beta))/2.0f);
772        break;
773      }
774      case PoissonNoise:
775      {
776        float
777          poisson;
778        unsigned int i;
779        poisson=exp(-SigmaPoisson*QuantumScale*pixel);
780        for (i=0; alpha > poisson; i++)
781        {
782          beta=mwcReadPseudoRandomValue(r);
783          alpha*=beta;
784        }
785        noise=(QuantumRange*i/SigmaPoisson);
786        break;
787      }
788      case RandomNoise:
789      {
790        noise=(QuantumRange*SigmaRandom*alpha);
791        break;
792      }
793    }
794    return noise;
795  }
796
797  __kernel
798  void AddNoise(const __global CLQuantum *image,
799    const unsigned int number_channels,const ChannelType channel,
800    const unsigned int length,const unsigned int pixelsPerWorkItem,
801    const NoiseType noise_type,const float attenuate,const unsigned int seed0,
802    const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
803    __global CLQuantum *filteredImage)
804  {
805    mwc64x_state_t rng;
806    rng.x = seed0;
807    rng.c = seed1;
808
809    uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
810    uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
811    MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
812
813    uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
814    uint count = pixelsPerWorkItem;
815
816    while (count > 0 && pos < length)
817    {
818      const __global CLQuantum *p = image + pos;
819      __global CLQuantum *q = filteredImage + pos;
820
821      float red;
822      float green;
823      float blue;
824      float alpha;
825
826      ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
827
828      if ((channel & RedChannel) != 0)
829        red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
830
831      if (number_channels > 2)
832      {
833        if ((channel & GreenChannel) != 0)
834          green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
835
836        if ((channel & BlueChannel) != 0)
837          blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
838      }
839
840      if (((number_channels == 4) || (number_channels == 2)) &&
841          ((channel & AlphaChannel) != 0))
842        alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
843
844      WriteChannels(q, number_channels, channel, red, green, blue, alpha);
845
846      pos += (get_local_size(0) * number_channels);
847      count--;
848    }
849  }
850  )
851
852/*
853%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
854%                                                                             %
855%                                                                             %
856%                                                                             %
857%    B l u r                                                                  %
858%                                                                             %
859%                                                                             %
860%                                                                             %
861%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
862*/
863
864  STRINGIFY(
865  /*
866  Reduce image noise and reduce detail levels by row
867  */
868  __kernel void BlurRow(const __global CLQuantum *image,
869    const unsigned int number_channels,const ChannelType channel,
870    __constant float *filter,const unsigned int width,
871    const unsigned int imageColumns,const unsigned int imageRows,
872    __local float4 *temp,__global float4 *tempImage)
873  {
874    const int x = get_global_id(0);
875    const int y = get_global_id(1);
876
877    const int columns = imageColumns;
878
879    const unsigned int radius = (width-1)/2;
880    const int wsize = get_local_size(0);
881    const unsigned int loadSize = wsize+width;
882
883    //group coordinate
884    const int groupX=get_local_size(0)*get_group_id(0);
885
886    //parallel load and clamp
887    for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
888    {
889      int cx = ClampToCanvas(i + groupX - radius, columns);
890      temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
891    }
892
893    // barrier
894    barrier(CLK_LOCAL_MEM_FENCE);
895
896    // only do the work if this is not a patched item
897    if (get_global_id(0) < columns)
898    {
899      // compute
900      float4 result = (float4) 0;
901
902      int i = 0;
903
904      for ( ; i+7 < width; )
905      {
906        for (int j=0; j < 8; j++)
907          result+=filter[i+j]*temp[i+j+get_local_id(0)];
908        i+=8;
909      }
910
911      for ( ; i < width; i++)
912        result+=filter[i]*temp[i+get_local_id(0)];
913
914      // write back to global
915      tempImage[y*columns+x] = result;
916    }
917  }
918  )
919
920  STRINGIFY(
921  /*
922  Reduce image noise and reduce detail levels by line
923  */
924  __kernel void BlurColumn(const __global float4 *blurRowData,
925    const unsigned int number_channels,const ChannelType channel,
926    __constant float *filter,const unsigned int width,
927    const unsigned int imageColumns,const unsigned int imageRows,
928    __local float4 *temp,__global CLQuantum *filteredImage)
929  {
930    const int x = get_global_id(0);
931    const int y = get_global_id(1);
932
933    const int columns = imageColumns;
934    const int rows = imageRows;
935
936    unsigned int radius = (width-1)/2;
937    const int wsize = get_local_size(1);
938    const unsigned int loadSize = wsize+width;
939
940    //group coordinate
941    const int groupX=get_local_size(0)*get_group_id(0);
942    const int groupY=get_local_size(1)*get_group_id(1);
943    //notice that get_local_size(0) is 1, so
944    //groupX=get_group_id(0);
945
946    //parallel load and clamp
947    for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
948      temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
949
950    // barrier
951    barrier(CLK_LOCAL_MEM_FENCE);
952
953    // only do the work if this is not a patched item
954    if (get_global_id(1) < rows)
955    {
956      // compute
957      float4 result = (float4) 0;
958
959      int i = 0;
960
961      for ( ; i+7 < width; )
962      {
963        for (int j=0; j < 8; j++)
964          result+=filter[i+j]*temp[i+j+get_local_id(1)];
965        i+=8;
966      }
967
968      for ( ; i < width; i++)
969        result+=filter[i]*temp[i+get_local_id(1)];
970
971      // write back to global
972      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
973    }
974  }
975  )
976
977/*
978%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
979%                                                                             %
980%                                                                             %
981%                                                                             %
982%    C o m p o s i t e                                                        %
983%                                                                             %
984%                                                                             %
985%                                                                             %
986%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
987*/
988
989  STRINGIFY(
990    inline float ColorDodge(const float Sca,
991      const float Sa,const float Dca,const float Da)
992    {
993      /*
994        Oct 2004 SVG specification.
995      */
996      if ((Sca*Da+Dca*Sa) >= Sa*Da)
997        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
998      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
999
1000
1001      /*
1002        New specification, March 2009 SVG specification.  This specification was
1003        also wrong of non-overlap cases.
1004      */
1005      /*
1006      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1007        return(Sca*(1.0-Da));
1008      if (fabs(Sca-Sa) < MagickEpsilon)
1009        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1010      return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca)));
1011      */
1012
1013      /*
1014        Working from first principles using the original formula:
1015
1016           f(Sc,Dc) = Dc/(1-Sc)
1017
1018        This works correctly! Looks like the 2004 model was right but just
1019        required a extra condition for correct handling.
1020      */
1021
1022      /*
1023      if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon))
1024        return(Sca*(1.0-Da)+Dca*(1.0-Sa));
1025      if (fabs(Sca-Sa) < MagickEpsilon)
1026        return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa));
1027      return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa));
1028      */
1029    }
1030
1031    inline void CompositeColorDodge(const float4 *p,
1032      const float4 *q,float4 *composite) {
1033
1034      float
1035      Da,
1036      gamma,
1037      Sa;
1038
1039      Sa=QuantumScale*getAlphaF4(*p);  /* simplify and speed up equations */
1040      Da=QuantumScale*getAlphaF4(*q);
1041      gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */
1042      setAlphaF4(composite,QuantumRange*gamma);
1043      gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma);
1044      setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale*
1045        getRedF4(*q)*Da,Da));
1046      setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale*
1047        getGreenF4(*q)*Da,Da));
1048      setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale*
1049        getBlueF4(*q)*Da,Da));
1050    }
1051  )
1052
1053  STRINGIFY(
1054    inline void MagickPixelCompositePlus(const float4 *p,
1055      const float alpha,const float4 *q,
1056      const float beta,float4 *composite)
1057    {
1058      float
1059        gamma;
1060
1061      float
1062        Da,
1063        Sa;
1064      /*
1065        Add two pixels with the given opacities.
1066      */
1067      Sa=QuantumScale*alpha;
1068      Da=QuantumScale*beta;
1069      gamma=RoundToUnity(Sa+Da);  /* 'Plus' blending -- not 'Over' blending */
1070      setAlphaF4(composite,QuantumRange*gamma);
1071      gamma=PerceptibleReciprocal(gamma);
1072      setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q)));
1073      setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q)));
1074      setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q)));
1075    }
1076  )
1077
1078  STRINGIFY(
1079    inline void MagickPixelCompositeBlend(const float4 *p,
1080      const float alpha,const float4 *q,
1081      const float beta,float4 *composite)
1082    {
1083      MagickPixelCompositePlus(p,(float) (alpha*
1084      (getAlphaF4(*p))),q,(float) (beta*
1085      (getAlphaF4(*q))),composite);
1086    }
1087  )
1088
1089  STRINGIFY(
1090    __kernel
1091    void Composite(__global CLPixelType *image,
1092                   const unsigned int imageWidth,
1093                   const unsigned int imageHeight,
1094                   const __global CLPixelType *compositeImage,
1095                   const unsigned int compositeWidth,
1096                   const unsigned int compositeHeight,
1097                   const unsigned int compose,
1098                   const ChannelType channel,
1099                   const unsigned int matte,
1100                   const float destination_dissolve,
1101                   const float source_dissolve) {
1102
1103      uint2 index;
1104      index.x = get_global_id(0);
1105      index.y = get_global_id(1);
1106
1107
1108      if (index.x >= imageWidth
1109        || index.y >= imageHeight) {
1110          return;
1111      }
1112      const CLPixelType inputPixel = image[index.y*imageWidth+index.x];
1113      float4 destination;
1114      setRedF4(&destination,getRed(inputPixel));
1115      setGreenF4(&destination,getGreen(inputPixel));
1116      setBlueF4(&destination,getBlue(inputPixel));
1117
1118
1119      const CLPixelType compositePixel
1120        = compositeImage[index.y*imageWidth+index.x];
1121      float4 source;
1122      setRedF4(&source,getRed(compositePixel));
1123      setGreenF4(&source,getGreen(compositePixel));
1124      setBlueF4(&source,getBlue(compositePixel));
1125
1126      if (matte != 0) {
1127        setAlphaF4(&destination,getAlpha(inputPixel));
1128        setAlphaF4(&source,getAlpha(compositePixel));
1129      }
1130      else {
1131        setAlphaF4(&destination,1.0f);
1132        setAlphaF4(&source,1.0f);
1133      }
1134
1135      float4 composite=destination;
1136
1137      CompositeOperator op = (CompositeOperator)compose;
1138      switch (op) {
1139      case ColorDodgeCompositeOp:
1140        CompositeColorDodge(&source,&destination,&composite);
1141        break;
1142      case BlendCompositeOp:
1143        MagickPixelCompositeBlend(&source,source_dissolve,&destination,
1144            destination_dissolve,&composite);
1145        break;
1146      default:
1147        // unsupported operators
1148        break;
1149      };
1150
1151      CLPixelType outputPixel;
1152      setRed(&outputPixel, ClampToQuantum(getRedF4(composite)));
1153      setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite)));
1154      setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite)));
1155      setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite)));
1156      image[index.y*imageWidth+index.x] = outputPixel;
1157    }
1158  )
1159
1160/*
1161%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1162%                                                                             %
1163%                                                                             %
1164%                                                                             %
1165%    C o n t r a s t                                                          %
1166%                                                                             %
1167%                                                                             %
1168%                                                                             %
1169%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1170*/
1171
1172  STRINGIFY(
1173
1174  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1175    float3 HueSaturationBrightness;
1176    HueSaturationBrightness.x = 0.0f; // Hue
1177    HueSaturationBrightness.y = 0.0f; // Saturation
1178    HueSaturationBrightness.z = 0.0f; // Brightness
1179
1180    float r=(float) getRed(pixel);
1181    float g=(float) getGreen(pixel);
1182    float b=(float) getBlue(pixel);
1183
1184    float tmin=MagickMin(MagickMin(r,g),b);
1185    float tmax=MagickMax(MagickMax(r,g),b);
1186
1187    if (tmax!=0.0f) {
1188      float delta=tmax-tmin;
1189      HueSaturationBrightness.y=delta/tmax;
1190      HueSaturationBrightness.z=QuantumScale*tmax;
1191
1192      if (delta != 0.0f) {
1193  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1194  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1195        HueSaturationBrightness.x/=6.0f;
1196        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1197      }
1198    }
1199    return HueSaturationBrightness;
1200  }
1201
1202  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1203
1204    float hue = HueSaturationBrightness.x;
1205    float brightness = HueSaturationBrightness.z;
1206    float saturation = HueSaturationBrightness.y;
1207
1208    CLPixelType rgb;
1209
1210    if (saturation == 0.0f) {
1211      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1212      setGreen(&rgb,getRed(rgb));
1213      setBlue(&rgb,getRed(rgb));
1214    }
1215    else {
1216
1217      float h=6.0f*(hue-floor(hue));
1218      float f=h-floor(h);
1219      float p=brightness*(1.0f-saturation);
1220      float q=brightness*(1.0f-saturation*f);
1221      float t=brightness*(1.0f-(saturation*(1.0f-f)));
1222
1223      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1224      float clamped_t = ClampToQuantum(QuantumRange*t);
1225      float clamped_p = ClampToQuantum(QuantumRange*p);
1226      float clamped_q = ClampToQuantum(QuantumRange*q);
1227      int ih = (int)h;
1228      setRed(&rgb, (ih == 1)?clamped_q:
1229        (ih == 2 || ih == 3)?clamped_p:
1230        (ih == 4)?clamped_t:
1231                 clampedBrightness);
1232
1233      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1234        (ih == 3)?clamped_q:
1235        (ih == 4 || ih == 5)?clamped_p:
1236                 clamped_t);
1237
1238      setBlue(&rgb, (ih == 2)?clamped_t:
1239        (ih == 3 || ih == 4)?clampedBrightness:
1240        (ih == 5)?clamped_q:
1241                 clamped_p);
1242    }
1243    return rgb;
1244  }
1245
1246  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1247  {
1248
1249    const int sign = sharpen!=0?1:-1;
1250    const int x = get_global_id(0);
1251    const int y = get_global_id(1);
1252    const int columns = get_global_size(0);
1253    const int c = x + y * columns;
1254
1255    CLPixelType pixel = im[c];
1256    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1257    float brightness = HueSaturationBrightness.z;
1258    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1259    brightness = clamp(brightness,0.0f,1.0f);
1260    HueSaturationBrightness.z = brightness;
1261
1262    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1263    filteredPixel.w = pixel.w;
1264    im[c] = filteredPixel;
1265  }
1266  )
1267
1268/*
1269%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1270%                                                                             %
1271%                                                                             %
1272%                                                                             %
1273%    C o n t r a s t S t r e t c h                                            %
1274%                                                                             %
1275%                                                                             %
1276%                                                                             %
1277%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1278*/
1279
1280    STRINGIFY(
1281    /*
1282    */
1283    __kernel void Histogram(__global CLPixelType * restrict im,
1284      const ChannelType channel,
1285      const unsigned int colorspace,
1286      const unsigned int method,
1287      __global uint4 * restrict histogram)
1288      {
1289        const int x = get_global_id(0);
1290        const int y = get_global_id(1);
1291        const int columns = get_global_size(0);
1292        const int c = x + y * columns;
1293        if ((channel & SyncChannels) != 0)
1294        {
1295          float red=(float)getRed(im[c]);
1296          float green=(float)getGreen(im[c]);
1297          float blue=(float)getBlue(im[c]);
1298
1299          float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1300          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1301          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1302        }
1303        else
1304        {
1305          // for equalizing, we always need all channels?
1306          // otherwise something more
1307        }
1308      }
1309    )
1310
1311    STRINGIFY(
1312    /*
1313    */
1314    __kernel void ContrastStretch(__global CLPixelType * restrict im,
1315      const ChannelType channel,
1316      __global CLPixelType * restrict stretch_map,
1317      const float4 white, const float4 black)
1318      {
1319        const int x = get_global_id(0);
1320        const int y = get_global_id(1);
1321        const int columns = get_global_size(0);
1322        const int c = x + y * columns;
1323
1324        uint ePos;
1325        CLPixelType oValue, eValue;
1326        CLQuantum red, green, blue, alpha;
1327
1328        //read from global
1329        oValue=im[c];
1330
1331        if ((channel & RedChannel) != 0)
1332        {
1333          if (getRedF4(white) != getRedF4(black))
1334          {
1335            ePos = ScaleQuantumToMap(getRed(oValue));
1336            eValue = stretch_map[ePos];
1337            red = getRed(eValue);
1338          }
1339        }
1340
1341        if ((channel & GreenChannel) != 0)
1342        {
1343          if (getGreenF4(white) != getGreenF4(black))
1344          {
1345            ePos = ScaleQuantumToMap(getGreen(oValue));
1346            eValue = stretch_map[ePos];
1347            green = getGreen(eValue);
1348          }
1349        }
1350
1351        if ((channel & BlueChannel) != 0)
1352        {
1353          if (getBlueF4(white) != getBlueF4(black))
1354          {
1355            ePos = ScaleQuantumToMap(getBlue(oValue));
1356            eValue = stretch_map[ePos];
1357            blue = getBlue(eValue);
1358          }
1359        }
1360
1361        if ((channel & AlphaChannel) != 0)
1362        {
1363          if (getAlphaF4(white) != getAlphaF4(black))
1364          {
1365            ePos = ScaleQuantumToMap(getAlpha(oValue));
1366            eValue = stretch_map[ePos];
1367            alpha = getAlpha(eValue);
1368          }
1369        }
1370
1371        //write back
1372        im[c]=(CLPixelType)(blue, green, red, alpha);
1373
1374      }
1375    )
1376
1377/*
1378%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1379%                                                                             %
1380%                                                                             %
1381%                                                                             %
1382%    C o n v o l v e                                                          %
1383%                                                                             %
1384%                                                                             %
1385%                                                                             %
1386%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1387*/
1388
1389  STRINGIFY(
1390    __kernel
1391    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1392    const unsigned int imageWidth, const unsigned int imageHeight,
1393    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1394    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1395
1396      int2 blockID;
1397      blockID.x = get_global_id(0) / get_local_size(0);
1398      blockID.y = get_global_id(1) / get_local_size(1);
1399
1400      // image area processed by this workgroup
1401      int2 imageAreaOrg;
1402      imageAreaOrg.x = blockID.x * get_local_size(0);
1403      imageAreaOrg.y = blockID.y * get_local_size(1);
1404
1405      int2 midFilterDimen;
1406      midFilterDimen.x = (filterWidth-1)/2;
1407      midFilterDimen.y = (filterHeight-1)/2;
1408
1409      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1410
1411      // dimension of the local cache
1412      int2 cachedAreaDimen;
1413      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1414      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1415
1416      // cache the pixels accessed by this workgroup in local memory
1417      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1418      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1419      int groupSize = get_local_size(0) * get_local_size(1);
1420      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1421
1422        int2 cachedAreaIndex;
1423        cachedAreaIndex.x = i % cachedAreaDimen.x;
1424        cachedAreaIndex.y = i / cachedAreaDimen.x;
1425
1426        int2 imagePixelIndex;
1427        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1428
1429        // only support EdgeVirtualPixelMethod through ClampToCanvas
1430        // TODO: implement other virtual pixel method
1431        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1432        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1433
1434        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1435      }
1436
1437      // cache the filter
1438      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1439        filterCache[i] = filter[i];
1440      }
1441      barrier(CLK_LOCAL_MEM_FENCE);
1442
1443
1444      int2 imageIndex;
1445      imageIndex.x = imageAreaOrg.x + get_local_id(0);
1446      imageIndex.y = imageAreaOrg.y + get_local_id(1);
1447
1448      // if out-of-range, stops here and quit
1449      if (imageIndex.x >= imageWidth
1450        || imageIndex.y >= imageHeight) {
1451          return;
1452      }
1453
1454      int filterIndex = 0;
1455      float4 sum = (float4)0.0f;
1456      float gamma = 0.0f;
1457      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1458        int cacheIndexY = get_local_id(1);
1459        for (int j = 0; j < filterHeight; j++) {
1460          int cacheIndexX = get_local_id(0);
1461          for (int i = 0; i < filterWidth; i++) {
1462            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1463            float f = filterCache[filterIndex];
1464
1465            sum.x += f * p.x;
1466            sum.y += f * p.y;
1467            sum.z += f * p.z;
1468            sum.w += f * p.w;
1469
1470            gamma += f;
1471            filterIndex++;
1472            cacheIndexX++;
1473          }
1474          cacheIndexY++;
1475        }
1476      }
1477      else {
1478        int cacheIndexY = get_local_id(1);
1479        for (int j = 0; j < filterHeight; j++) {
1480          int cacheIndexX = get_local_id(0);
1481          for (int i = 0; i < filterWidth; i++) {
1482
1483            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1484            float alpha = QuantumScale*p.w;
1485            float f = filterCache[filterIndex];
1486            float g = alpha * f;
1487
1488            sum.x += g*p.x;
1489            sum.y += g*p.y;
1490            sum.z += g*p.z;
1491            sum.w += f*p.w;
1492
1493            gamma += g;
1494            filterIndex++;
1495            cacheIndexX++;
1496          }
1497          cacheIndexY++;
1498        }
1499        gamma = PerceptibleReciprocal(gamma);
1500        sum.xyz = gamma*sum.xyz;
1501      }
1502      CLPixelType outputPixel;
1503      outputPixel.x = ClampToQuantum(sum.x);
1504      outputPixel.y = ClampToQuantum(sum.y);
1505      outputPixel.z = ClampToQuantum(sum.z);
1506      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1507
1508      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1509    }
1510  )
1511
1512  STRINGIFY(
1513    __kernel
1514    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1515                  const uint imageWidth, const uint imageHeight,
1516                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1517                  const uint matte, const ChannelType channel) {
1518
1519      int2 imageIndex;
1520      imageIndex.x = get_global_id(0);
1521      imageIndex.y = get_global_id(1);
1522
1523      /*
1524      unsigned int imageWidth = get_global_size(0);
1525      unsigned int imageHeight = get_global_size(1);
1526      */
1527      if (imageIndex.x >= imageWidth
1528          || imageIndex.y >= imageHeight)
1529          return;
1530
1531      int2 midFilterDimen;
1532      midFilterDimen.x = (filterWidth-1)/2;
1533      midFilterDimen.y = (filterHeight-1)/2;
1534
1535      int filterIndex = 0;
1536      float4 sum = (float4)0.0f;
1537      float gamma = 0.0f;
1538      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1539        for (int j = 0; j < filterHeight; j++) {
1540          int2 inputPixelIndex;
1541          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1542          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1543          for (int i = 0; i < filterWidth; i++) {
1544            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1545            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1546
1547            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1548            float f = filter[filterIndex];
1549
1550            sum.x += f * p.x;
1551            sum.y += f * p.y;
1552            sum.z += f * p.z;
1553            sum.w += f * p.w;
1554
1555            gamma += f;
1556
1557            filterIndex++;
1558          }
1559        }
1560      }
1561      else {
1562
1563        for (int j = 0; j < filterHeight; j++) {
1564          int2 inputPixelIndex;
1565          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1566          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1567          for (int i = 0; i < filterWidth; i++) {
1568            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1569            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1570
1571            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1572            float alpha = QuantumScale*p.w;
1573            float f = filter[filterIndex];
1574            float g = alpha * f;
1575
1576            sum.x += g*p.x;
1577            sum.y += g*p.y;
1578            sum.z += g*p.z;
1579            sum.w += f*p.w;
1580
1581            gamma += g;
1582
1583
1584            filterIndex++;
1585          }
1586        }
1587        gamma = PerceptibleReciprocal(gamma);
1588        sum.xyz = gamma*sum.xyz;
1589      }
1590
1591      CLPixelType outputPixel;
1592      outputPixel.x = ClampToQuantum(sum.x);
1593      outputPixel.y = ClampToQuantum(sum.y);
1594      outputPixel.z = ClampToQuantum(sum.z);
1595      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1596
1597      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1598    }
1599  )
1600
1601/*
1602%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1603%                                                                             %
1604%                                                                             %
1605%                                                                             %
1606%     D e s p e c k l e                                                       %
1607%                                                                             %
1608%                                                                             %
1609%                                                                             %
1610%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1611*/
1612
1613  STRINGIFY(
1614
1615  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1616  , const unsigned int imageWidth, const unsigned int imageHeight
1617  , const int2 offset, const int polarity, const int matte) {
1618
1619    int x = get_global_id(0);
1620    int y = get_global_id(1);
1621
1622    CLPixelType v = inputImage[y*imageWidth+x];
1623
1624    int2 neighbor;
1625    neighbor.y = y + offset.y;
1626    neighbor.x = x + offset.x;
1627
1628    int2 clampedNeighbor;
1629    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1630    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1631
1632    CLPixelType r = (clampedNeighbor.x == neighbor.x
1633                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1634    :(CLPixelType)0;
1635
1636    int sv[4];
1637    sv[0] = (int)v.x;
1638    sv[1] = (int)v.y;
1639    sv[2] = (int)v.z;
1640    sv[3] = (int)v.w;
1641
1642    int sr[4];
1643    sr[0] = (int)r.x;
1644    sr[1] = (int)r.y;
1645    sr[2] = (int)r.z;
1646    sr[3] = (int)r.w;
1647
1648    if (polarity > 0) {
1649      \n #pragma unroll 4\n
1650      for (unsigned int i = 0; i < 4; i++) {
1651        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1652      }
1653    }
1654    else {
1655      \n #pragma unroll 4\n
1656      for (unsigned int i = 0; i < 4; i++) {
1657        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1658      }
1659
1660    }
1661
1662    v.x = (CLQuantum)sv[0];
1663    v.y = (CLQuantum)sv[1];
1664    v.z = (CLQuantum)sv[2];
1665
1666    if (matte!=0)
1667      v.w = (CLQuantum)sv[3];
1668
1669    outputImage[y*imageWidth+x] = v;
1670
1671    }
1672
1673
1674  )
1675
1676  STRINGIFY(
1677
1678  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1679  , const unsigned int imageWidth, const unsigned int imageHeight
1680  , const int2 offset, const int polarity, const int matte) {
1681
1682    int x = get_global_id(0);
1683    int y = get_global_id(1);
1684
1685    CLPixelType v = inputImage[y*imageWidth+x];
1686
1687    int2 neighbor, clampedNeighbor;
1688
1689    neighbor.y = y + offset.y;
1690    neighbor.x = x + offset.x;
1691    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1692    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1693
1694    CLPixelType r = (clampedNeighbor.x == neighbor.x
1695      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1696    :(CLPixelType)0;
1697
1698
1699    neighbor.y = y - offset.y;
1700    neighbor.x = x - offset.x;
1701    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1702    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1703
1704    CLPixelType s = (clampedNeighbor.x == neighbor.x
1705      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1706    :(CLPixelType)0;
1707
1708
1709    int sv[4];
1710    sv[0] = (int)v.x;
1711    sv[1] = (int)v.y;
1712    sv[2] = (int)v.z;
1713    sv[3] = (int)v.w;
1714
1715    int sr[4];
1716    sr[0] = (int)r.x;
1717    sr[1] = (int)r.y;
1718    sr[2] = (int)r.z;
1719    sr[3] = (int)r.w;
1720
1721    int ss[4];
1722    ss[0] = (int)s.x;
1723    ss[1] = (int)s.y;
1724    ss[2] = (int)s.z;
1725    ss[3] = (int)s.w;
1726
1727    if (polarity > 0) {
1728      \n #pragma unroll 4\n
1729      for (unsigned int i = 0; i < 4; i++) {
1730        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1731        //
1732        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1733        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1734        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1735      }
1736    }
1737    else {
1738      \n #pragma unroll 4\n
1739      for (unsigned int i = 0; i < 4; i++) {
1740        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1741        //
1742        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1743        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1744      }
1745    }
1746
1747    v.x = (CLQuantum)sv[0];
1748    v.y = (CLQuantum)sv[1];
1749    v.z = (CLQuantum)sv[2];
1750
1751    if (matte!=0)
1752      v.w = (CLQuantum)sv[3];
1753
1754    outputImage[y*imageWidth+x] = v;
1755
1756    }
1757  )
1758
1759/*
1760%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1761%                                                                             %
1762%                                                                             %
1763%                                                                             %
1764%     E q u a l i z e                                                         %
1765%                                                                             %
1766%                                                                             %
1767%                                                                             %
1768%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1769*/
1770
1771    STRINGIFY(
1772    /*
1773    */
1774    __kernel void Equalize(__global CLPixelType * restrict im,
1775      const ChannelType channel,
1776      __global CLPixelType * restrict equalize_map,
1777      const float4 white, const float4 black)
1778      {
1779        const int x = get_global_id(0);
1780        const int y = get_global_id(1);
1781        const int columns = get_global_size(0);
1782        const int c = x + y * columns;
1783
1784        uint ePos;
1785        CLPixelType oValue, eValue;
1786        CLQuantum red, green, blue, alpha;
1787
1788        //read from global
1789        oValue=im[c];
1790
1791        if ((channel & SyncChannels) != 0)
1792        {
1793          if (getRedF4(white) != getRedF4(black))
1794          {
1795            ePos = ScaleQuantumToMap(getRed(oValue));
1796            eValue = equalize_map[ePos];
1797            red = getRed(eValue);
1798            ePos = ScaleQuantumToMap(getGreen(oValue));
1799            eValue = equalize_map[ePos];
1800            green = getRed(eValue);
1801            ePos = ScaleQuantumToMap(getBlue(oValue));
1802            eValue = equalize_map[ePos];
1803            blue = getRed(eValue);
1804            ePos = ScaleQuantumToMap(getAlpha(oValue));
1805            eValue = equalize_map[ePos];
1806            alpha = getRed(eValue);
1807
1808            //write back
1809            im[c]=(CLPixelType)(blue, green, red, alpha);
1810          }
1811
1812        }
1813
1814        // for equalizing, we always need all channels?
1815        // otherwise something more
1816
1817     }
1818    )
1819
1820/*
1821%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1822%                                                                             %
1823%                                                                             %
1824%                                                                             %
1825%     F u n c t i o n                                                         %
1826%                                                                             %
1827%                                                                             %
1828%                                                                             %
1829%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1830*/
1831
1832  STRINGIFY(
1833  /*
1834  apply FunctionImageChannel(braightness-contrast)
1835  */
1836  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1837    const unsigned int number_parameters,__constant float *parameters)
1838  {
1839    float result = 0.0f;
1840
1841    switch (function)
1842    {
1843    case PolynomialFunction:
1844      {
1845        for (unsigned int i=0; i < number_parameters; i++)
1846          result = result*QuantumScale*pixel + parameters[i];
1847        result *= QuantumRange;
1848        break;
1849      }
1850    case SinusoidFunction:
1851      {
1852        float  freq,phase,ampl,bias;
1853        freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1854        phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1855        ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1856        bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1857        result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1858          (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1859        break;
1860      }
1861    case ArcsinFunction:
1862      {
1863        float  width,range,center,bias;
1864        width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1865        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1866        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1867        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1868
1869        result = 2.0f/width*(QuantumScale*pixel - center);
1870        result = range/MagickPI*asin(result)+bias;
1871        result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1872        result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1873        result *= QuantumRange;
1874        break;
1875      }
1876    case ArctanFunction:
1877      {
1878        float slope,range,center,bias;
1879        slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1880        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1881        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1882        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1883        result = MagickPI*slope*(QuantumScale*pixel-center);
1884        result = QuantumRange*(range/MagickPI*atan(result) + bias);
1885        break;
1886      }
1887    case UndefinedFunction:
1888      break;
1889    }
1890    return(ClampToQuantum(result));
1891  }
1892  )
1893
1894  STRINGIFY(
1895  /*
1896  Improve brightness / contrast of the image
1897  channel : define which channel is improved
1898  function : the function called to enchance the brightness contrast
1899  number_parameters : numbers of parameters
1900  parameters : the parameter
1901  */
1902  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1903    const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1904    __constant float *parameters)
1905  {
1906    const unsigned int x = get_global_id(0);
1907    const unsigned int y = get_global_id(1);
1908    const unsigned int columns = get_global_size(0);
1909    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1910
1911    float red;
1912    float green;
1913    float blue;
1914    float alpha;
1915
1916    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1917
1918    if ((channel & RedChannel) != 0)
1919      red=ApplyFunction(red, function, number_parameters, parameters);
1920
1921    if (number_channels > 2)
1922      {
1923        if ((channel & GreenChannel) != 0)
1924          green=ApplyFunction(green, function, number_parameters, parameters);
1925
1926        if ((channel & BlueChannel) != 0)
1927          blue=ApplyFunction(blue, function, number_parameters, parameters);
1928      }
1929
1930    if (((number_channels == 4) || (number_channels == 2)) &&
1931        ((channel & AlphaChannel) != 0))
1932      alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1933
1934    WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1935  }
1936  )
1937
1938/*
1939%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1940%                                                                             %
1941%                                                                             %
1942%                                                                             %
1943%     G r a y s c a l e                                                       %
1944%                                                                             %
1945%                                                                             %
1946%                                                                             %
1947%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1948*/
1949
1950  STRINGIFY(
1951  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1952    const unsigned int colorspace,const unsigned int method)
1953  {
1954    const unsigned int x = get_global_id(0);
1955    const unsigned int y = get_global_id(1);
1956    const unsigned int columns = get_global_size(0);
1957    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1958
1959    float
1960      blue,
1961      green,
1962      red;
1963
1964    red=getPixelRed(p);
1965    green=getPixelGreen(p);
1966    blue=getPixelBlue(p);
1967
1968    CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1969
1970    setPixelRed(p,intensity);
1971    setPixelGreen(p,intensity);
1972    setPixelBlue(p,intensity);
1973  }
1974  )
1975
1976/*
1977%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1978%                                                                             %
1979%                                                                             %
1980%                                                                             %
1981%     L o c a l C o n t r a s t                                               %
1982%                                                                             %
1983%                                                                             %
1984%                                                                             %
1985%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1986*/
1987
1988    STRINGIFY(
1989
1990      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1991          const int radius,
1992          const int imageWidth,
1993          const int imageHeight)
1994      {
1995        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1996
1997        int x = get_local_id(0);
1998        int y = get_global_id(1);
1999
2000        global CLPixelType *src = srcImage + y * imageWidth;
2001
2002        for (int i = x; i < imageWidth; i += get_local_size(0)) {
2003            float sum = 0.0f;
2004            float weight = 1.0f;
2005
2006            int j = i - radius;
2007            while ((j + 7) < i) {
2008                for (int k = 0; k < 8; ++k) // Unroll 8x
2009                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
2010                weight += 8.0f;
2011                j+=8;
2012            }
2013            while (j < i) {
2014                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
2015                weight += 1.0f;
2016                ++j;
2017            }
2018
2019            while ((j + 7) < radius + i) {
2020                for (int k = 0; k < 8; ++k) // Unroll 8x
2021                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
2022                weight -= 8.0f;
2023                j+=8;
2024            }
2025            while (j < radius + i) {
2026                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
2027                weight -= 1.0f;
2028                ++j;
2029            }
2030
2031            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
2032        }
2033      }
2034    )
2035
2036    STRINGIFY(
2037      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
2038          const int radius,
2039          const float strength,
2040          const int imageWidth,
2041          const int imageHeight)
2042      {
2043        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
2044
2045        int x = get_global_id(0);
2046        int y = get_global_id(1);
2047
2048        if ((x >= imageWidth) || (y >= imageHeight))
2049                return;
2050
2051        global float *src = blurImage + x;
2052
2053        float sum = 0.0f;
2054        float weight = 1.0f;
2055
2056        int j = y - radius;
2057        while ((j + 7) < y) {
2058            for (int k = 0; k < 8; ++k) // Unroll 8x
2059                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
2060            weight += 8.0f;
2061            j+=8;
2062        }
2063        while (j < y) {
2064            sum += weight * src[mirrorBottom(j) * imageWidth];
2065            weight += 1.0f;
2066            ++j;
2067        }
2068
2069        while ((j + 7) < radius + y) {
2070            for (int k = 0; k < 8; ++k) // Unroll 8x
2071                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
2072            weight -= 8.0f;
2073            j+=8;
2074        }
2075        while (j < radius + y) {
2076            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
2077            weight -= 1.0f;
2078            ++j;
2079        }
2080
2081        CLPixelType pixel = srcImage[x + y * imageWidth];
2082        float srcVal = dot(RGB, convert_float4(pixel));
2083        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
2084        mult = (srcVal + mult) / srcVal;
2085
2086        pixel.x = ClampToQuantum(pixel.x * mult);
2087        pixel.y = ClampToQuantum(pixel.y * mult);
2088        pixel.z = ClampToQuantum(pixel.z * mult);
2089
2090        dstImage[x + y * imageWidth] = pixel;
2091      }
2092    )
2093
2094/*
2095%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2096%                                                                             %
2097%                                                                             %
2098%                                                                             %
2099%     M o d u l a t e                                                         %
2100%                                                                             %
2101%                                                                             %
2102%                                                                             %
2103%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2104*/
2105
2106  STRINGIFY(
2107
2108  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
2109    float *hue, float *saturation, float *lightness)
2110  {
2111  float
2112    c,
2113    tmax,
2114    tmin;
2115
2116  /*
2117     Convert RGB to HSL colorspace.
2118     */
2119  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
2120  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
2121
2122  c=tmax-tmin;
2123
2124  *lightness=(tmax+tmin)/2.0;
2125  if (c <= 0.0)
2126  {
2127    *hue=0.0;
2128    *saturation=0.0;
2129    return;
2130  }
2131
2132  if (tmax == (QuantumScale*red))
2133  {
2134    *hue=(QuantumScale*green-QuantumScale*blue)/c;
2135    if ((QuantumScale*green) < (QuantumScale*blue))
2136      *hue+=6.0;
2137  }
2138  else
2139    if (tmax == (QuantumScale*green))
2140      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2141    else
2142      *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2143
2144  *hue*=60.0/360.0;
2145  if (*lightness <= 0.5)
2146    *saturation=c/(2.0*(*lightness));
2147  else
2148    *saturation=c/(2.0-2.0*(*lightness));
2149  }
2150
2151  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2152      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2153  {
2154    float
2155      b,
2156      c,
2157      g,
2158      h,
2159      tmin,
2160      r,
2161      x;
2162
2163    /*
2164       Convert HSL to RGB colorspace.
2165       */
2166    h=hue*360.0;
2167    if (lightness <= 0.5)
2168      c=2.0*lightness*saturation;
2169    else
2170      c=(2.0-2.0*lightness)*saturation;
2171    tmin=lightness-0.5*c;
2172    h-=360.0*floor(h/360.0);
2173    h/=60.0;
2174    x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2175    switch ((int) floor(h) % 6)
2176    {
2177      case 0:
2178        {
2179          r=tmin+c;
2180          g=tmin+x;
2181          b=tmin;
2182          break;
2183        }
2184      case 1:
2185        {
2186          r=tmin+x;
2187          g=tmin+c;
2188          b=tmin;
2189          break;
2190        }
2191      case 2:
2192        {
2193          r=tmin;
2194          g=tmin+c;
2195          b=tmin+x;
2196          break;
2197        }
2198      case 3:
2199        {
2200          r=tmin;
2201          g=tmin+x;
2202          b=tmin+c;
2203          break;
2204        }
2205      case 4:
2206        {
2207          r=tmin+x;
2208          g=tmin;
2209          b=tmin+c;
2210          break;
2211        }
2212      case 5:
2213        {
2214          r=tmin+c;
2215          g=tmin;
2216          b=tmin+x;
2217          break;
2218        }
2219      default:
2220        {
2221          r=0.0;
2222          g=0.0;
2223          b=0.0;
2224        }
2225    }
2226    *red=ClampToQuantum(QuantumRange*r);
2227    *green=ClampToQuantum(QuantumRange*g);
2228    *blue=ClampToQuantum(QuantumRange*b);
2229  }
2230
2231  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2232    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2233  {
2234    float
2235      hue,
2236      lightness,
2237      saturation;
2238
2239    /*
2240    Increase or decrease color lightness, saturation, or hue.
2241    */
2242    ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2243    hue+=0.5*(0.01*percent_hue-1.0);
2244    while (hue < 0.0)
2245      hue+=1.0;
2246    while (hue >= 1.0)
2247      hue-=1.0;
2248    saturation*=0.01*percent_saturation;
2249    lightness*=0.01*percent_lightness;
2250    ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2251  }
2252
2253  __kernel void Modulate(__global CLPixelType *im,
2254    const float percent_brightness,
2255    const float percent_hue,
2256    const float percent_saturation,
2257    const int colorspace)
2258  {
2259
2260    const int x = get_global_id(0);
2261    const int y = get_global_id(1);
2262    const int columns = get_global_size(0);
2263    const int c = x + y * columns;
2264
2265    CLPixelType pixel = im[c];
2266
2267    CLQuantum
2268        blue,
2269        green,
2270        red;
2271
2272    red=getRed(pixel);
2273    green=getGreen(pixel);
2274    blue=getBlue(pixel);
2275
2276    switch (colorspace)
2277    {
2278      case HSLColorspace:
2279      default:
2280        {
2281          ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2282              &red, &green, &blue);
2283        }
2284
2285    }
2286
2287    CLPixelType filteredPixel;
2288
2289    setRed(&filteredPixel, red);
2290    setGreen(&filteredPixel, green);
2291    setBlue(&filteredPixel, blue);
2292    filteredPixel.w = pixel.w;
2293
2294    im[c] = filteredPixel;
2295  }
2296  )
2297
2298/*
2299%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2300%                                                                             %
2301%                                                                             %
2302%                                                                             %
2303%     M o t i o n B l u r                                                     %
2304%                                                                             %
2305%                                                                             %
2306%                                                                             %
2307%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2308*/
2309
2310  STRINGIFY(
2311    __kernel
2312    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2313                    const unsigned int imageWidth, const unsigned int imageHeight,
2314                    const __global float *filter, const unsigned int width, const __global int2* offset,
2315                    const float4 bias,
2316                    const ChannelType channel, const unsigned int matte) {
2317
2318      int2 currentPixel;
2319      currentPixel.x = get_global_id(0);
2320      currentPixel.y = get_global_id(1);
2321
2322      if (currentPixel.x >= imageWidth
2323          || currentPixel.y >= imageHeight)
2324          return;
2325
2326      float4 pixel;
2327      pixel.x = (float)bias.x;
2328      pixel.y = (float)bias.y;
2329      pixel.z = (float)bias.z;
2330      pixel.w = (float)bias.w;
2331
2332      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2333
2334        for (int i = 0; i < width; i++) {
2335          // only support EdgeVirtualPixelMethod through ClampToCanvas
2336          // TODO: implement other virtual pixel method
2337          int2 samplePixel = currentPixel + offset[i];
2338          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2339          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2340          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2341
2342          pixel.x += (filter[i] * (float)samplePixelValue.x);
2343          pixel.y += (filter[i] * (float)samplePixelValue.y);
2344          pixel.z += (filter[i] * (float)samplePixelValue.z);
2345          pixel.w += (filter[i] * (float)samplePixelValue.w);
2346        }
2347
2348        CLPixelType outputPixel;
2349        outputPixel.x = ClampToQuantum(pixel.x);
2350        outputPixel.y = ClampToQuantum(pixel.y);
2351        outputPixel.z = ClampToQuantum(pixel.z);
2352        outputPixel.w = ClampToQuantum(pixel.w);
2353        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2354      }
2355      else {
2356
2357        float gamma = 0.0f;
2358        for (int i = 0; i < width; i++) {
2359          // only support EdgeVirtualPixelMethod through ClampToCanvas
2360          // TODO: implement other virtual pixel method
2361          int2 samplePixel = currentPixel + offset[i];
2362          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2363          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2364
2365          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2366
2367          float alpha = QuantumScale*samplePixelValue.w;
2368          float k = filter[i];
2369          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2370          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2371          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2372
2373          pixel.w += k * alpha * samplePixelValue.w;
2374
2375          gamma+=k*alpha;
2376        }
2377        gamma = PerceptibleReciprocal(gamma);
2378        pixel.xyz = gamma*pixel.xyz;
2379
2380        CLPixelType outputPixel;
2381        outputPixel.x = ClampToQuantum(pixel.x);
2382        outputPixel.y = ClampToQuantum(pixel.y);
2383        outputPixel.z = ClampToQuantum(pixel.z);
2384        outputPixel.w = ClampToQuantum(pixel.w);
2385        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2386      }
2387    }
2388  )
2389
2390/*
2391%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2392%                                                                             %
2393%                                                                             %
2394%                                                                             %
2395%     R e s i z e                                                             %
2396%                                                                             %
2397%                                                                             %
2398%                                                                             %
2399%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2400*/
2401
2402  STRINGIFY(
2403  // Based on Box from resize.c
2404  float BoxResizeFilter(const float x)
2405  {
2406    return 1.0f;
2407  }
2408  )
2409
2410  STRINGIFY(
2411  // Based on CubicBC from resize.c
2412  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2413  {
2414    /*
2415    Cubic Filters using B,C determined values:
2416    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2417    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2418    Spline              B = 1   C = 0    B-Spline Gaussian approximation
2419    Hermite             B = 0   C = 0    B-Spline interpolator
2420
2421    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2422    Graphics Computer Graphics, Volume 22, Number 4, August 1988
2423    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2424    Mitchell.pdf.
2425
2426    Coefficents are determined from B,C values:
2427    P0 = (  6 - 2*B       )/6 = coeff[0]
2428    P1 =         0
2429    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2430    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2431    Q0 = (      8*B +24*C )/6 = coeff[3]
2432    Q1 = (    -12*B -48*C )/6 = coeff[4]
2433    Q2 = (      6*B +30*C )/6 = coeff[5]
2434    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2435
2436    which are used to define the filter:
2437
2438    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2439    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2440
2441    which ensures function is continuous in value and derivative (slope).
2442    */
2443    if (x < 1.0)
2444      return(resizeFilterCoefficients[0]+x*(x*
2445      (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2446    if (x < 2.0)
2447      return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2448      (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2449    return(0.0);
2450  }
2451  )
2452
2453  STRINGIFY(
2454  float Sinc(const float x)
2455  {
2456    if (x != 0.0f)
2457    {
2458      const float alpha=(float) (MagickPI*x);
2459      return sinpi(x)/alpha;
2460    }
2461    return(1.0f);
2462  }
2463  )
2464
2465  STRINGIFY(
2466  float Triangle(const float x)
2467  {
2468    /*
2469    1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2470    a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2471    for Sinc().
2472    */
2473    return ((x<1.0f)?(1.0f-x):0.0f);
2474  }
2475  )
2476
2477
2478  STRINGIFY(
2479  float Hann(const float x)
2480  {
2481    /*
2482    Cosine window function:
2483      0.5+0.5*cos(pi*x).
2484    */
2485    const float cosine=cos((MagickPI*x));
2486    return(0.5f+0.5f*cosine);
2487  }
2488  )
2489
2490  STRINGIFY(
2491  float Hamming(const float x)
2492  {
2493    /*
2494      Offset cosine window function:
2495       .54 + .46 cos(pi x).
2496    */
2497    const float cosine=cos((MagickPI*x));
2498    return(0.54f+0.46f*cosine);
2499  }
2500  )
2501
2502  STRINGIFY(
2503  float Blackman(const float x)
2504  {
2505    /*
2506      Blackman: 2nd order cosine windowing function:
2507        0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2508
2509      Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2510      five flops.
2511    */
2512    const float cosine=cos((MagickPI*x));
2513    return(0.34f+cosine*(0.5f+cosine*0.16f));
2514  }
2515  )
2516
2517  STRINGIFY(
2518  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2519  {
2520    switch (filterType)
2521    {
2522    /* Call Sinc even for SincFast to get better precision on GPU
2523       and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2524    case SincWeightingFunction:
2525    case SincFastWeightingFunction:
2526      return Sinc(x);
2527    case CubicBCWeightingFunction:
2528      return CubicBC(x,filterCoefficients);
2529    case BoxWeightingFunction:
2530      return BoxResizeFilter(x);
2531    case TriangleWeightingFunction:
2532      return Triangle(x);
2533    case HannWeightingFunction:
2534      return Hann(x);
2535    case HammingWeightingFunction:
2536      return Hamming(x);
2537    case BlackmanWeightingFunction:
2538      return Blackman(x);
2539
2540    default:
2541      return 0.0f;
2542    }
2543  }
2544  )
2545
2546
2547  STRINGIFY(
2548  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2549           , const ResizeWeightingFunctionType resizeWindowType
2550           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2551  {
2552    float scale;
2553    float xBlur = fabs(x/resizeFilterBlur);
2554    if (resizeWindowSupport < MagickEpsilon
2555        || resizeWindowType == BoxWeightingFunction)
2556    {
2557      scale = 1.0f;
2558    }
2559    else
2560    {
2561      scale = resizeFilterScale;
2562      scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2563    }
2564    float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2565    return weight;
2566  }
2567
2568  )
2569
2570  ;
2571  const char *accelerateKernels2 =
2572
2573  STRINGIFY(
2574
2575  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2576    return (numWorkItems/pixelPerWorkgroup);
2577  }
2578
2579  // returns the index of the pixel for the current workitem to compute.
2580  // returns -1 if this workitem doesn't need to participate in any computation
2581  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2582    const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2583    int pixelIndex = itemID/numWorkItemsPerPixel;
2584    pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2585    return pixelIndex;
2586  }
2587
2588  )
2589
2590  STRINGIFY(
2591  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2592    void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2593      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2594      const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2595      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2596      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2597      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2598      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2599      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2600  {
2601    // calculate the range of resized image pixels computed by this workgroup
2602    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2603    const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2604    const unsigned int actualNumPixelToCompute = stopX - startX;
2605
2606    // calculate the range of input image pixels to cache
2607    float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2608    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2609    scale = PerceptibleReciprocal(scale);
2610
2611    const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2612    const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2613
2614    // cache the input pixels into local memory
2615    const unsigned int y = get_global_id(1);
2616    const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2617    const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2618    event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2619    wait_group_events(1, &e);
2620
2621    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2622    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2623    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2624    {
2625      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2626      const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2627      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2628
2629      // determine which resized pixel computed by this workitem
2630      const unsigned int itemID = get_local_id(0);
2631      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2632
2633      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2634
2635      float4 filteredPixel = (float4)0.0f;
2636      float density = 0.0f;
2637      float gamma = 0.0f;
2638      // -1 means this workitem doesn't participate in the computation
2639      if (pixelIndex != -1)
2640      {
2641        // x coordinated of the resized pixel computed by this workitem
2642        const int x = chunkStartX + pixelIndex;
2643
2644        // calculate how many steps required for this pixel
2645        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2646        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2647        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2648        const unsigned int n = stop - start;
2649
2650        // calculate how many steps this workitem will contribute
2651        unsigned int numStepsPerWorkItem = n / numItems;
2652        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2653
2654        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2655        if (startStep < n)
2656        {
2657          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2658
2659          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2660
2661          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2662          {
2663            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2664              (ResizeWeightingFunctionType) resizeFilterType,
2665              (ResizeWeightingFunctionType) resizeWindowType,
2666              resizeFilterScale, resizeFilterWindowSupport,
2667              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2668
2669            float4 cp = (float4) 0;
2670
2671            __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2672            cp.x = (float) *(p);
2673            if (number_channels > 2)
2674            {
2675              cp.y = (float) *(p + 1);
2676              cp.z = (float) *(p + 2);
2677            }
2678
2679            if (alpha_index != 0)
2680            {
2681              cp.w = (float) *(p + alpha_index);
2682
2683              float alpha = weight * QuantumScale * cp.w;
2684
2685              filteredPixel.x += alpha * cp.x;
2686              filteredPixel.y += alpha * cp.y;
2687              filteredPixel.z += alpha * cp.z;
2688              filteredPixel.w += weight * cp.w;
2689              gamma += alpha;
2690            }
2691            else
2692              filteredPixel += ((float4) weight)*cp;
2693
2694            density += weight;
2695          }
2696        }
2697      }
2698
2699      // initialize the accumulators to zero
2700      if (itemID < actualNumPixelInThisChunk) {
2701        outputPixelCache[itemID] = (float4)0.0f;
2702        densityCache[itemID] = 0.0f;
2703        if (alpha_index != 0)
2704          gammaCache[itemID] = 0.0f;
2705      }
2706      barrier(CLK_LOCAL_MEM_FENCE);
2707
2708      // accumulatte the filtered pixel value and the density
2709      for (unsigned int i = 0; i < numItems; i++) {
2710        if (pixelIndex != -1) {
2711          if (itemID%numItems == i) {
2712            outputPixelCache[pixelIndex]+=filteredPixel;
2713            densityCache[pixelIndex]+=density;
2714            if (alpha_index != 0)
2715              gammaCache[pixelIndex]+=gamma;
2716          }
2717        }
2718        barrier(CLK_LOCAL_MEM_FENCE);
2719      }
2720
2721      if (itemID < actualNumPixelInThisChunk)
2722      {
2723        float4 filteredPixel = outputPixelCache[itemID];
2724
2725        float gamma = 0.0f;
2726        if (alpha_index != 0)
2727          gamma = gammaCache[itemID];
2728
2729        float density = densityCache[itemID];
2730        if ((density != 0.0f) && (density != 1.0f))
2731        {
2732          density = PerceptibleReciprocal(density);
2733          filteredPixel *= (float4) density;
2734          gamma *= density;
2735        }
2736
2737        if (alpha_index != 0)
2738        {
2739          gamma = PerceptibleReciprocal(gamma);
2740          filteredPixel.x *= gamma;
2741          filteredPixel.y *= gamma;
2742          filteredPixel.z *= gamma;
2743        }
2744
2745        WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel);
2746      }
2747    }
2748  }
2749  )
2750
2751
2752  STRINGIFY(
2753 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2754    void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2755      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2756      const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2757      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2758      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2759      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2760      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2761      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2762  {
2763    // calculate the range of resized image pixels computed by this workgroup
2764    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2765    const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2766    const unsigned int actualNumPixelToCompute = stopY - startY;
2767
2768    // calculate the range of input image pixels to cache
2769    float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2770    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2771    scale = PerceptibleReciprocal(scale);
2772
2773    const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2774    const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2775
2776    // cache the input pixels into local memory
2777    const unsigned int x = get_global_id(0);
2778    unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2779    unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2780    unsigned int stride = inputColumns * number_channels;
2781    for (unsigned int i = 0; i < number_channels; i++)
2782    {
2783      event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2784      wait_group_events(1,&e);
2785    }
2786
2787    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2788    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2789    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2790    {
2791      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2792      const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2793      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2794
2795      // determine which resized pixel computed by this workitem
2796      const unsigned int itemID = get_local_id(1);
2797      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2798
2799      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2800
2801      float4 filteredPixel = (float4)0.0f;
2802      float density = 0.0f;
2803      float gamma = 0.0f;
2804      // -1 means this workitem doesn't participate in the computation
2805      if (pixelIndex != -1)
2806      {
2807        // x coordinated of the resized pixel computed by this workitem
2808        const int y = chunkStartY + pixelIndex;
2809
2810        // calculate how many steps required for this pixel
2811        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2812        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2813        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2814        const unsigned int n = stop - start;
2815
2816        // calculate how many steps this workitem will contribute
2817        unsigned int numStepsPerWorkItem = n / numItems;
2818        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2819
2820        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2821        if (startStep < n)
2822        {
2823          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2824
2825          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2826          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2827          {
2828            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2829              (ResizeWeightingFunctionType) resizeFilterType,
2830              (ResizeWeightingFunctionType) resizeWindowType,
2831              resizeFilterScale, resizeFilterWindowSupport,
2832              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2833
2834            float4 cp = (float4)0.0f;
2835
2836            __local CLQuantum *p = inputImageCache + cacheIndex;
2837            cp.x = (float) *(p);
2838            if (number_channels > 2)
2839            {
2840              cp.y = (float) *(p + rangeLength);
2841              cp.z = (float) *(p + (rangeLength * 2));
2842            }
2843
2844            if (alpha_index != 0)
2845            {
2846              cp.w = (float) *(p + (rangeLength * alpha_index));
2847
2848              float alpha = weight * QuantumScale * cp.w;
2849
2850              filteredPixel.x += alpha * cp.x;
2851              filteredPixel.y += alpha * cp.y;
2852              filteredPixel.z += alpha * cp.z;
2853              filteredPixel.w += weight * cp.w;
2854              gamma += alpha;
2855            }
2856            else
2857              filteredPixel += ((float4) weight)*cp;
2858
2859            density += weight;
2860          }
2861        }
2862      }
2863
2864      // initialize the accumulators to zero
2865      if (itemID < actualNumPixelInThisChunk) {
2866        outputPixelCache[itemID] = (float4)0.0f;
2867        densityCache[itemID] = 0.0f;
2868        if (alpha_index != 0)
2869          gammaCache[itemID] = 0.0f;
2870      }
2871      barrier(CLK_LOCAL_MEM_FENCE);
2872
2873      // accumulatte the filtered pixel value and the density
2874      for (unsigned int i = 0; i < numItems; i++) {
2875        if (pixelIndex != -1) {
2876          if (itemID%numItems == i) {
2877            outputPixelCache[pixelIndex]+=filteredPixel;
2878            densityCache[pixelIndex]+=density;
2879            if (alpha_index != 0)
2880              gammaCache[pixelIndex]+=gamma;
2881          }
2882        }
2883        barrier(CLK_LOCAL_MEM_FENCE);
2884      }
2885
2886      if (itemID < actualNumPixelInThisChunk)
2887      {
2888        float4 filteredPixel = outputPixelCache[itemID];
2889
2890        float gamma = 0.0f;
2891        if (alpha_index != 0)
2892          gamma = gammaCache[itemID];
2893
2894        float density = densityCache[itemID];
2895        if ((density != 0.0f) && (density != 1.0f))
2896        {
2897          density = PerceptibleReciprocal(density);
2898          filteredPixel *= (float4) density;
2899          gamma *= density;
2900        }
2901
2902        if (alpha_index != 0)
2903        {
2904          gamma = PerceptibleReciprocal(gamma);
2905          filteredPixel.x *= gamma;
2906          filteredPixel.y *= gamma;
2907          filteredPixel.z *= gamma;
2908        }
2909
2910        WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel);
2911      }
2912    }
2913  }
2914  )
2915
2916/*
2917%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2918%                                                                             %
2919%                                                                             %
2920%                                                                             %
2921%     R o t a t i o n a l B l u r                                             %
2922%                                                                             %
2923%                                                                             %
2924%                                                                             %
2925%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2926*/
2927
2928  STRINGIFY(
2929  __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels,
2930    const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta,
2931    __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage)
2932  {
2933    const int x = get_global_id(0);
2934    const int y = get_global_id(1);
2935    const int columns = get_global_size(0);
2936    const int rows = get_global_size(1);
2937    unsigned int step = 1;
2938    float center_x = (float) x - blurCenter.x;
2939    float center_y = (float) y - blurCenter.y;
2940    float radius = hypot(center_x, center_y);
2941
2942    float blur_radius = hypot(blurCenter.x, blurCenter.y);
2943
2944    if (radius > MagickEpsilon)
2945    {
2946      step = (unsigned int) (blur_radius / radius);
2947      if (step == 0)
2948        step = 1;
2949      if (step >= cossin_theta_size)
2950        step = cossin_theta_size-1;
2951    }
2952
2953    float4 result = bias;
2954
2955    float normalize = 0.0f;
2956    float gamma = 0.0f;
2957
2958    for (unsigned int i=0; i<cossin_theta_size; i+=step)
2959    {
2960      int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2961      int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2962
2963      float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, channel);
2964
2965      if ((number_channels == 4) || (number_channels == 2))
2966      {
2967        float alpha = (float)(QuantumScale*pixel.w);
2968
2969        gamma += alpha;
2970
2971        result.x += alpha * pixel.x;
2972        result.y += alpha * pixel.y;
2973        result.z += alpha * pixel.z;
2974        result.w += pixel.w;
2975      }
2976      else
2977        result += pixel;
2978
2979      normalize += 1.0f;
2980    }
2981
2982    normalize = PerceptibleReciprocal(normalize);
2983
2984    if ((number_channels == 4) || (number_channels == 2))
2985    {
2986      gamma = PerceptibleReciprocal(gamma);
2987      result.x *= gamma;
2988      result.y *= gamma;
2989      result.z *= gamma;
2990      result.w *= normalize;
2991    }
2992    else
2993      result *= normalize;
2994
2995    WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2996  }
2997  )
2998
2999/*
3000%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3001%                                                                             %
3002%                                                                             %
3003%                                                                             %
3004%     U n s h a r p M a s k                                                   %
3005%                                                                             %
3006%                                                                             %
3007%                                                                             %
3008%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
3009*/
3010
3011  STRINGIFY(
3012  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
3013    const __global float4 *blurRowData,const unsigned int number_channels,
3014    const ChannelType channel,const unsigned int columns,
3015    const unsigned int rows,__local float4* cachedData,
3016    __local float* cachedFilter,const __global float *filter,
3017    const unsigned int width,const float gain, const float threshold,
3018    __global CLQuantum *filteredImage)
3019  {
3020    const unsigned int radius = (width-1)/2;
3021
3022    // cache the pixel shared by the workgroup
3023    const int groupX = get_group_id(0);
3024    const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
3025    const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
3026
3027    if ((groupStartY >= 0) && (groupStopY < rows))
3028    {
3029      event_t e = async_work_group_strided_copy(cachedData,
3030        blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
3031      wait_group_events(1,&e);
3032    }
3033    else
3034    {
3035      for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
3036        cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
3037
3038      barrier(CLK_LOCAL_MEM_FENCE);
3039    }
3040    // cache the filter as well
3041    event_t e = async_work_group_copy(cachedFilter,filter,width,0);
3042    wait_group_events(1,&e);
3043
3044    // only do the work if this is not a patched item
3045    const int cy = get_global_id(1);
3046
3047    if (cy < rows)
3048    {
3049      float4 blurredPixel = (float4) 0.0f;
3050
3051      int i = 0;
3052
3053      for ( ; i+7 < width; )
3054      {
3055        for (int j=0; j < 8; j++, i++)
3056          blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
3057      }
3058
3059      for ( ; i < width; i++)
3060        blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
3061
3062      float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
3063      float4 outputPixel = inputImagePixel - blurredPixel;
3064
3065      float quantumThreshold = QuantumRange*threshold;
3066
3067      int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
3068      outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
3069
3070      //write back
3071      WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
3072    }
3073  }
3074  )
3075
3076  STRINGIFY(
3077  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
3078    const ChannelType channel,__constant float *filter,const unsigned int width,
3079    const unsigned int columns,const unsigned int rows,__local float4 *pixels,
3080    const float gain,const float threshold,__global CLQuantum *filteredImage)
3081  {
3082    const unsigned int x = get_global_id(0);
3083    const unsigned int y = get_global_id(1);
3084
3085    const unsigned int radius = (width - 1) / 2;
3086
3087    int row = y - radius;
3088    int baseRow = get_group_id(1) * get_local_size(1) - radius;
3089    int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
3090
3091    while (row < endRow) {
3092      int srcy = (row < 0) ? -row : row; // mirror pad
3093      srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
3094
3095      float4 value = 0.0f;
3096
3097      int ix = x - radius;
3098      int i = 0;
3099
3100      while (i + 7 < width) {
3101        for (int j = 0; j < 8; ++j) { // unrolled
3102          int srcx = ix + j;
3103          srcx = (srcx < 0) ? -srcx : srcx;
3104          srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
3105          value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
3106        }
3107        ix += 8;
3108        i += 8;
3109      }
3110
3111      while (i < width) {
3112        int srcx = (ix < 0) ? -ix : ix; // mirror pad
3113        srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
3114        value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
3115        ++i;
3116        ++ix;
3117      }
3118      pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
3119      row += get_local_size(1);
3120    }
3121
3122    barrier(CLK_LOCAL_MEM_FENCE);
3123
3124    const int px = get_local_id(0);
3125    const int py = get_local_id(1);
3126    const int prp = get_local_size(0);
3127    float4 value = (float4)(0.0f);
3128
3129    int i = 0;
3130    while (i + 7 < width) {
3131      for (int j = 0; j < 8; ++j) // unrolled
3132        value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
3133      i += 8;
3134    }
3135    while (i < width) {
3136      value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
3137      ++i;
3138    }
3139
3140    float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3141    float4 diff = srcPixel - value;
3142
3143    float quantumThreshold = QuantumRange*threshold;
3144
3145    int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3146    value = select(srcPixel + diff * gain, srcPixel, mask);
3147
3148    if ((x < columns) && (y < rows))
3149      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3150  }
3151  )
3152
3153  STRINGIFY(
3154    __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3155    void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3156      const unsigned int number_channels,const unsigned int max_channels,
3157      const float threshold,const int passes,const unsigned int imageWidth,
3158      const unsigned int imageHeight)
3159  {
3160    const int pad = (1 << (passes - 1));
3161    const int tileSize = 64;
3162    const int tileRowPixels = 64;
3163    const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3164
3165    CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3166
3167    local float buffer[64 * 64];
3168
3169    int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
3170    int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
3171
3172    for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3173      int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3174                (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3175
3176      for (int channel = 0; channel < max_channels; ++channel)
3177        stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3178    }
3179
3180    for (int channel = 0; channel < max_channels; ++channel) {
3181      // Load LDS
3182      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3183        buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3184
3185      // Process
3186
3187      float tmp[16];
3188      float accum[16];
3189      float pixel;
3190
3191      for (int i = 0; i < 16; i++)
3192        accum[i]=0.0f;
3193
3194      for (int pass = 0; pass < passes; ++pass) {
3195        const int radius = 1 << pass;
3196        const int x = get_local_id(0);
3197        const float thresh = threshold * noise[pass];
3198
3199        // Apply horizontal hat
3200        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3201          const int offset = i * tileRowPixels;
3202          if (pass == 0)
3203            tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3204          pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3205          barrier(CLK_LOCAL_MEM_FENCE);
3206          buffer[x + offset] = pixel;
3207        }
3208        barrier(CLK_LOCAL_MEM_FENCE);
3209
3210        // Apply vertical hat
3211        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3212          pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3213          float delta = tmp[i / 4] - pixel;
3214          tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3215          if (delta < -thresh)
3216            delta += thresh;
3217          else if (delta > thresh)
3218            delta -= thresh;
3219          else
3220            delta = 0;
3221          accum[i / 4] += delta;
3222        }
3223        barrier(CLK_LOCAL_MEM_FENCE);
3224
3225        if (pass < passes - 1)
3226          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3227            buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3228        else  // last pass
3229          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3230            accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3231        barrier(CLK_LOCAL_MEM_FENCE);
3232      }
3233
3234      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3235        stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3236
3237      barrier(CLK_LOCAL_MEM_FENCE);
3238    }
3239
3240    // Write from stage to output
3241
3242    if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3243      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3244        if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3245          int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3246          for (int channel = 0; channel < max_channels; ++channel) {
3247            dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3248          }
3249        }
3250      }
3251    }
3252  }
3253  )
3254
3255  ;
3256
3257#endif // MAGICKCORE_OPENCL_SUPPORT
3258
3259#if defined(__cplusplus) || defined(c_plusplus)
3260}
3261#endif
3262
3263#endif // _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3264