1/*
2  Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7
8    http://www.imagemagick.org/script/license.php
9
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15
16  MagickCore private kernels for accelerated functions.
17*/
18
19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21
22#if defined(__cplusplus) || defined(c_plusplus)
23extern "C" {
24#endif
25
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
27
28/*
29  Define declarations.
30*/
31#define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE()		"\n #""else " " \n"
34#define OPENCL_ENDIF()		"\n #""endif " " \n"
35#define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
37
38/*
39  Typedef declarations.
40*/
41
42typedef struct _FloatPixelPacket
43{
44  MagickRealType
45    red,
46    green,
47    blue,
48    alpha,
49    black;
50} FloatPixelPacket;
51
52const char *accelerateKernels =
53
54/*
55  Define declarations.
56*/
57  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
58  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
59  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
60  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
61  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
62  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
63  OPENCL_DEFINE(SigmaRandom, (attenuate))
64  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
65  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
66  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
67
68/*
69  Typedef declarations.
70*/
71  STRINGIFY(
72    typedef enum
73  {
74    UndefinedColorspace,
75    RGBColorspace,            /* Linear RGB colorspace */
76    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
77    TransparentColorspace,
78    OHTAColorspace,
79    LabColorspace,
80    XYZColorspace,
81    YCbCrColorspace,
82    YCCColorspace,
83    YIQColorspace,
84    YPbPrColorspace,
85    YUVColorspace,
86    CMYKColorspace,           /* negared linear RGB with black separated */
87    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
88    HSBColorspace,
89    HSLColorspace,
90    HWBColorspace,
91    Rec601LumaColorspace,
92    Rec601YCbCrColorspace,
93    Rec709LumaColorspace,
94    Rec709YCbCrColorspace,
95    LogColorspace,
96    CMYColorspace,            /* negated linear RGB colorspace */
97    LuvColorspace,
98    HCLColorspace,
99    LCHColorspace,            /* alias for LCHuv */
100    LMSColorspace,
101    LCHabColorspace,          /* Cylindrical (Polar) Lab */
102    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
103    scRGBColorspace,
104    HSIColorspace,
105    HSVColorspace,            /* alias for HSB */
106    HCLpColorspace,
107    YDbDrColorspace
108  } ColorspaceType;
109  )
110
111  STRINGIFY(
112    typedef enum
113    {
114      UndefinedCompositeOp,
115      NoCompositeOp,
116      ModulusAddCompositeOp,
117      AtopCompositeOp,
118      BlendCompositeOp,
119      BumpmapCompositeOp,
120      ChangeMaskCompositeOp,
121      ClearCompositeOp,
122      ColorBurnCompositeOp,
123      ColorDodgeCompositeOp,
124      ColorizeCompositeOp,
125      CopyBlackCompositeOp,
126      CopyBlueCompositeOp,
127      CopyCompositeOp,
128      CopyCyanCompositeOp,
129      CopyGreenCompositeOp,
130      CopyMagentaCompositeOp,
131      CopyOpacityCompositeOp,
132      CopyRedCompositeOp,
133      CopyYellowCompositeOp,
134      DarkenCompositeOp,
135      DstAtopCompositeOp,
136      DstCompositeOp,
137      DstInCompositeOp,
138      DstOutCompositeOp,
139      DstOverCompositeOp,
140      DifferenceCompositeOp,
141      DisplaceCompositeOp,
142      DissolveCompositeOp,
143      ExclusionCompositeOp,
144      HardLightCompositeOp,
145      HueCompositeOp,
146      InCompositeOp,
147      LightenCompositeOp,
148      LinearLightCompositeOp,
149      LuminizeCompositeOp,
150      MinusDstCompositeOp,
151      ModulateCompositeOp,
152      MultiplyCompositeOp,
153      OutCompositeOp,
154      OverCompositeOp,
155      OverlayCompositeOp,
156      PlusCompositeOp,
157      ReplaceCompositeOp,
158      SaturateCompositeOp,
159      ScreenCompositeOp,
160      SoftLightCompositeOp,
161      SrcAtopCompositeOp,
162      SrcCompositeOp,
163      SrcInCompositeOp,
164      SrcOutCompositeOp,
165      SrcOverCompositeOp,
166      ModulusSubtractCompositeOp,
167      ThresholdCompositeOp,
168      XorCompositeOp,
169      /* These are new operators, added after the above was last sorted.
170       * The list should be re-sorted only when a new library version is
171       * created.
172       */
173      DivideDstCompositeOp,
174      DistortCompositeOp,
175      BlurCompositeOp,
176      PegtopLightCompositeOp,
177      VividLightCompositeOp,
178      PinLightCompositeOp,
179      LinearDodgeCompositeOp,
180      LinearBurnCompositeOp,
181      MathematicsCompositeOp,
182      DivideSrcCompositeOp,
183      MinusSrcCompositeOp,
184      DarkenIntensityCompositeOp,
185      LightenIntensityCompositeOp
186    } CompositeOperator;
187  )
188
189  STRINGIFY(
190     typedef enum
191     {
192       UndefinedFunction,
193       ArcsinFunction,
194       ArctanFunction,
195       PolynomialFunction,
196       SinusoidFunction
197     } MagickFunction;
198  )
199
200  STRINGIFY(
201    typedef enum
202    {
203      UndefinedNoise,
204      UniformNoise,
205      GaussianNoise,
206      MultiplicativeGaussianNoise,
207      ImpulseNoise,
208      LaplacianNoise,
209      PoissonNoise,
210      RandomNoise
211    } NoiseType;
212  )
213
214  STRINGIFY(
215    typedef enum
216  {
217    UndefinedPixelIntensityMethod = 0,
218    AveragePixelIntensityMethod,
219    BrightnessPixelIntensityMethod,
220    LightnessPixelIntensityMethod,
221    Rec601LumaPixelIntensityMethod,
222    Rec601LuminancePixelIntensityMethod,
223    Rec709LumaPixelIntensityMethod,
224    Rec709LuminancePixelIntensityMethod,
225    RMSPixelIntensityMethod,
226    MSPixelIntensityMethod
227  } PixelIntensityMethod;
228  )
229
230  STRINGIFY(
231  typedef enum {
232    BoxWeightingFunction = 0,
233    TriangleWeightingFunction,
234    CubicBCWeightingFunction,
235    HannWeightingFunction,
236    HammingWeightingFunction,
237    BlackmanWeightingFunction,
238    GaussianWeightingFunction,
239    QuadraticWeightingFunction,
240    JincWeightingFunction,
241    SincWeightingFunction,
242    SincFastWeightingFunction,
243    KaiserWeightingFunction,
244    WelshWeightingFunction,
245    BohmanWeightingFunction,
246    LagrangeWeightingFunction,
247    CosineWeightingFunction,
248  } ResizeWeightingFunctionType;
249  )
250
251  STRINGIFY(
252     typedef enum
253     {
254       UndefinedChannel = 0x0000,
255       RedChannel = 0x0001,
256       GrayChannel = 0x0001,
257       CyanChannel = 0x0001,
258       GreenChannel = 0x0002,
259       MagentaChannel = 0x0002,
260       BlueChannel = 0x0004,
261       YellowChannel = 0x0004,
262       BlackChannel = 0x0008,
263       AlphaChannel = 0x0010,
264       OpacityChannel = 0x0010,
265       IndexChannel = 0x0020,             /* Color Index Table? */
266       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
267       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
268       MetaChannel = 0x0100,              /* ???? */
269       CompositeChannels = 0x001F,
270       AllChannels = 0x7ffffff,
271       /*
272         Special purpose channel types.
273         FUTURE: are these needed any more - they are more like hacks
274         SyncChannels for example is NOT a real channel but a 'flag'
275         It really says -- "User has not defined channels"
276         Though it does have extra meaning in the "-auto-level" operator
277       */
278       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
279       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
280       GrayChannels = 0x0400,
281       SyncChannels = 0x20000,    /* channels modified as a single unit */
282       DefaultChannels = AllChannels
283     } ChannelType;  /* must correspond to PixelChannel */
284  )
285
286/*
287  Helper functions.
288*/
289
290OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
291
292  STRINGIFY(
293    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294    {
295      return((CLQuantum) value);
296    }
297  )
298
299OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
300
301  STRINGIFY(
302    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
303    {
304      return((CLQuantum) (257.0f*value));
305    }
306  )
307
308OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
309
310  STRINGIFY(
311    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
312    {
313      return((CLQuantum) (16843009.0*value));
314    }
315  )
316
317OPENCL_ENDIF()
318
319  STRINGIFY(
320    inline int ClampToCanvas(const int offset,const int range)
321      {
322        return clamp(offset, (int)0, range-1);
323      }
324  )
325
326  STRINGIFY(
327    inline CLQuantum ClampToQuantum(const float value)
328      {
329        return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
330      }
331  )
332
333  STRINGIFY(
334    inline uint ScaleQuantumToMap(CLQuantum value)
335      {
336        if (value >= (CLQuantum) MaxMap)
337          return ((uint)MaxMap);
338        else
339          return ((uint)value);
340      }
341  )
342
343  STRINGIFY(
344    inline float PerceptibleReciprocal(const float x)
345    {
346      float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347      return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348    }
349  )
350
351  STRINGIFY(
352  inline float RoundToUnity(const float value)
353   {
354     return clamp(value,0.0f,1.0f);
355   }
356  )
357
358  STRINGIFY(
359
360  inline unsigned int getPixelIndex(const unsigned int number_channels,
361    const unsigned int columns, const unsigned int x, const unsigned int y)
362  {
363    return (x * number_channels) + (y * columns * number_channels);
364  }
365
366  inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
367  inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
368  inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
369  inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
370
371  inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
372  inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
373  inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
374  inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
375
376  inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
377  inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
378  inline float getBlueF4(float4 p)                      { return p.x; }
379  inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
380
381  inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
382  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
383  inline float getGreenF4(float4 p)                     { return p.y; }
384  inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
385
386  inline CLQuantum getRed(CLPixelType p)                { return p.z; }
387  inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
388  inline float getRedF4(float4 p)                       { return p.z; }
389  inline void setRedF4(float4* p, float value)          { (*p).z = value; }
390
391  inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
392  inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
393  inline float getAlphaF4(float4 p)                     { return p.w; }
394  inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
395
396  inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
397    const ChannelType channel, float *red, float *green, float *blue, float *alpha)
398  {
399    if ((channel & RedChannel) != 0)
400      *red=getPixelRed(p);
401
402    if (number_channels > 2)
403      {
404        if ((channel & GreenChannel) != 0)
405          *green=getPixelGreen(p);
406
407        if ((channel & BlueChannel) != 0)
408          *blue=getPixelBlue(p);
409      }
410
411    if (((number_channels == 4) || (number_channels == 2)) &&
412        ((channel & AlphaChannel) != 0))
413      *alpha=getPixelAlpha(p,number_channels);
414  }
415
416  inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
417    const unsigned int columns, const unsigned int x, const unsigned int y)
418  {
419    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
420
421    float4 pixel;
422
423    pixel.x=getPixelRed(p);
424
425    if (number_channels > 2)
426      {
427        pixel.y=getPixelGreen(p);
428        pixel.z=getPixelBlue(p);
429      }
430
431    if ((number_channels == 4) || (number_channels == 2))
432      pixel.w=getPixelAlpha(p,number_channels);
433    return(pixel);
434  }
435
436  inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
437    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
438  {
439    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
440
441    float red = 0.0f;
442    float green = 0.0f;
443    float blue = 0.0f;
444    float alpha = 0.0f;
445
446    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
447    return (float4)(red, green, blue, alpha);
448  }
449
450  inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
451    const ChannelType channel, float red, float green, float blue, float alpha)
452  {
453    if ((channel & RedChannel) != 0)
454      setPixelRed(p,ClampToQuantum(red));
455
456    if (number_channels > 2)
457      {
458        if ((channel & GreenChannel) != 0)
459          setPixelGreen(p,ClampToQuantum(green));
460
461        if ((channel & BlueChannel) != 0)
462          setPixelBlue(p,ClampToQuantum(blue));
463      }
464
465    if (((number_channels == 4) || (number_channels == 2)) &&
466        ((channel & AlphaChannel) != 0))
467      setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
468  }
469
470  inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
471    const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
472  {
473    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
474
475    setPixelRed(p,ClampToQuantum(pixel.x));
476
477    if (number_channels > 2)
478      {
479        setPixelGreen(p,ClampToQuantum(pixel.y));
480        setPixelBlue(p,ClampToQuantum(pixel.z));
481      }
482
483    if ((number_channels == 4) || (number_channels == 2))
484      setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
485  }
486
487  inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
488    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
489    float4 pixel)
490  {
491    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
492    WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
493  }
494
495  inline float GetPixelIntensity(const unsigned int colorspace,
496    const unsigned int method,float red,float green,float blue)
497  {
498    float intensity;
499
500    if (colorspace == GRAYColorspace)
501      return red;
502
503    switch (method)
504    {
505      case AveragePixelIntensityMethod:
506        {
507          intensity=(red+green+blue)/3.0;
508          break;
509        }
510      case BrightnessPixelIntensityMethod:
511        {
512          intensity=MagickMax(MagickMax(red,green),blue);
513          break;
514        }
515      case LightnessPixelIntensityMethod:
516        {
517          intensity=(MagickMin(MagickMin(red,green),blue)+
518              MagickMax(MagickMax(red,green),blue))/2.0;
519          break;
520        }
521      case MSPixelIntensityMethod:
522        {
523          intensity=(float) (((float) red*red+green*green+blue*blue)/
524              (3.0*QuantumRange));
525          break;
526        }
527      case Rec601LumaPixelIntensityMethod:
528        {
529          /*
530          if (image->colorspace == RGBColorspace)
531          {
532            red=EncodePixelGamma(red);
533            green=EncodePixelGamma(green);
534            blue=EncodePixelGamma(blue);
535          }
536          */
537          intensity=0.298839*red+0.586811*green+0.114350*blue;
538          break;
539        }
540      case Rec601LuminancePixelIntensityMethod:
541        {
542          /*
543          if (image->colorspace == sRGBColorspace)
544          {
545            red=DecodePixelGamma(red);
546            green=DecodePixelGamma(green);
547            blue=DecodePixelGamma(blue);
548          }
549          */
550          intensity=0.298839*red+0.586811*green+0.114350*blue;
551          break;
552        }
553      case Rec709LumaPixelIntensityMethod:
554      default:
555        {
556          /*
557          if (image->colorspace == RGBColorspace)
558          {
559            red=EncodePixelGamma(red);
560            green=EncodePixelGamma(green);
561            blue=EncodePixelGamma(blue);
562          }
563          */
564          intensity=0.212656*red+0.715158*green+0.072186*blue;
565          break;
566        }
567      case Rec709LuminancePixelIntensityMethod:
568        {
569          /*
570          if (image->colorspace == sRGBColorspace)
571          {
572            red=DecodePixelGamma(red);
573            green=DecodePixelGamma(green);
574            blue=DecodePixelGamma(blue);
575          }
576          */
577          intensity=0.212656*red+0.715158*green+0.072186*blue;
578          break;
579        }
580      case RMSPixelIntensityMethod:
581        {
582          intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
583              sqrt(3.0));
584          break;
585        }
586    }
587
588    return intensity;
589  }
590
591  inline int mirrorBottom(int value)
592  {
593      return (value < 0) ? - (value) : value;
594  }
595
596  inline int mirrorTop(int value, int width)
597  {
598      return (value >= width) ? (2 * width - value - 1) : value;
599  }
600  )
601
602/*
603%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
604%                                                                             %
605%                                                                             %
606%                                                                             %
607%     A d d N o i s e                                                         %
608%                                                                             %
609%                                                                             %
610%                                                                             %
611%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
612*/
613
614  STRINGIFY(
615  /*
616  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
617  This is provided under BSD, full license is with the main package.
618  See http://www.doc.ic.ac.uk/~dt10/research
619  */
620
621  // Pre: a<M, b<M
622  // Post: r=(a+b) mod M
623  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
624  {
625    ulong v=a+b;
626    //if( (v>=M) || (v<a) )
627    if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
628      v=v-M;
629    return v;
630  }
631
632  // Pre: a<M,b<M
633  // Post: r=(a*b) mod M
634  // This could be done more efficently, but it is portable, and should
635  // be easy to understand. It can be replaced with any of the better
636  // modular multiplication algorithms (for example if you know you have
637  // double precision available or something).
638  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
639  {
640    ulong r=0;
641    while(a!=0){
642      if(a&1)
643        r=MWC_AddMod64(r,b,M);
644      b=MWC_AddMod64(b,b,M);
645      a=a>>1;
646    }
647    return r;
648  }
649
650  // Pre: a<M, e>=0
651  // Post: r=(a^b) mod M
652  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
653  // most architectures
654  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
655  {
656    ulong sqr=a, acc=1;
657    while(e!=0){
658      if(e&1)
659        acc=MWC_MulMod64(acc,sqr,M);
660        sqr=MWC_MulMod64(sqr,sqr,M);
661      e=e>>1;
662    }
663    return acc;
664  }
665
666  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
667  {
668    ulong m=MWC_PowMod64(A, distance, M);
669    ulong x=curr.x*(ulong)A+curr.y;
670    x=MWC_MulMod64(x, m, M);
671    return (uint2)((uint)(x/A), (uint)(x%A));
672  }
673
674  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
675  {
676    // This is an arbitrary constant for starting LCG jumping from. I didn't
677    // want to start from 1, as then you end up with the two or three first values
678    // being a bit poor in ones - once you've decided that, one constant is as
679    // good as any another. There is no deep mathematical reason for it, I just
680    // generated a random number.
681    enum{ MWC_BASEID = 4077358422479273989UL };
682
683    ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
684    ulong m=MWC_PowMod64(A, dist, M);
685
686    ulong x=MWC_MulMod64(MWC_BASEID, m, M);
687    return (uint2)((uint)(x/A), (uint)(x%A));
688  }
689
690  //! Represents the state of a particular generator
691  typedef struct{ uint x; uint c; } mwc64x_state_t;
692
693  enum{ MWC64X_A = 4294883355U };
694  enum{ MWC64X_M = 18446383549859758079UL };
695
696  void MWC64X_Step(mwc64x_state_t *s)
697  {
698    uint X=s->x, C=s->c;
699
700    uint Xn=MWC64X_A*X+C;
701    uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
702    uint Cn=mad_hi(MWC64X_A,X,carry);
703
704    s->x=Xn;
705    s->c=Cn;
706  }
707
708  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
709  {
710    uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
711    s->x=tmp.x;
712    s->c=tmp.y;
713  }
714
715  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
716  {
717    uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
718    s->x=tmp.x;
719    s->c=tmp.y;
720  }
721
722  //! Return a 32-bit integer in the range [0..2^32)
723  uint MWC64X_NextUint(mwc64x_state_t *s)
724  {
725    uint res=s->x ^ s->c;
726    MWC64X_Step(s);
727    return res;
728  }
729
730  //
731  // End of MWC64X excerpt
732  //
733
734  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
735  {
736    return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
737  }
738
739  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
740  {
741    float
742      alpha,
743      beta,
744      noise,
745      sigma;
746
747    noise = 0.0f;
748    alpha=mwcReadPseudoRandomValue(r);
749    switch(noise_type)
750    {
751      case UniformNoise:
752      default:
753        {
754          noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
755          break;
756        }
757      case GaussianNoise:
758        {
759          float
760            gamma,
761            tau;
762
763          if (alpha == 0.0f)
764            alpha=1.0f;
765          beta=mwcReadPseudoRandomValue(r);
766          gamma=sqrt(-2.0f*log(alpha));
767          sigma=gamma*cospi((2.0f*beta));
768          tau=gamma*sinpi((2.0f*beta));
769          noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
770          break;
771        }
772      case ImpulseNoise:
773      {
774        if (alpha < (SigmaImpulse/2.0f))
775          noise=0.0f;
776        else
777          if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
778            noise=QuantumRange;
779          else
780            noise=pixel;
781        break;
782      }
783      case LaplacianNoise:
784      {
785        if (alpha <= 0.5f)
786          {
787            if (alpha <= MagickEpsilon)
788              noise=(pixel-QuantumRange);
789            else
790              noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
791                0.5f);
792            break;
793          }
794        beta=1.0f-alpha;
795        if (beta <= (0.5f*MagickEpsilon))
796          noise=(pixel+QuantumRange);
797        else
798          noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
799        break;
800      }
801      case MultiplicativeGaussianNoise:
802      {
803        sigma=1.0f;
804        if (alpha > MagickEpsilon)
805          sigma=sqrt(-2.0f*log(alpha));
806        beta=mwcReadPseudoRandomValue(r);
807        noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
808          cospi((2.0f*beta))/2.0f);
809        break;
810      }
811      case PoissonNoise:
812      {
813        float
814          poisson;
815        unsigned int i;
816        poisson=exp(-SigmaPoisson*QuantumScale*pixel);
817        for (i=0; alpha > poisson; i++)
818        {
819          beta=mwcReadPseudoRandomValue(r);
820          alpha*=beta;
821        }
822        noise=(QuantumRange*i/SigmaPoisson);
823        break;
824      }
825      case RandomNoise:
826      {
827        noise=(QuantumRange*SigmaRandom*alpha);
828        break;
829      }
830    }
831    return noise;
832  }
833
834  __kernel
835  void AddNoise(const __global CLQuantum *image,
836    const unsigned int number_channels,const ChannelType channel,
837    const unsigned int length,const unsigned int pixelsPerWorkItem,
838    const NoiseType noise_type,const float attenuate,const unsigned int seed0,
839    const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
840    __global CLQuantum *filteredImage)
841  {
842    mwc64x_state_t rng;
843    rng.x = seed0;
844    rng.c = seed1;
845
846    uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
847    uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
848    MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
849
850    uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
851    uint count = pixelsPerWorkItem;
852
853    while (count > 0 && pos < length)
854    {
855      const __global CLQuantum *p = image + pos;
856      __global CLQuantum *q = filteredImage + pos;
857
858      float red;
859      float green;
860      float blue;
861      float alpha;
862
863      ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
864
865      if ((channel & RedChannel) != 0)
866        red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
867
868      if (number_channels > 2)
869      {
870        if ((channel & GreenChannel) != 0)
871          green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
872
873        if ((channel & BlueChannel) != 0)
874          blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
875      }
876
877      if (((number_channels == 4) || (number_channels == 2)) &&
878          ((channel & AlphaChannel) != 0))
879        alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
880
881      WriteChannels(q, number_channels, channel, red, green, blue, alpha);
882
883      pos += (get_local_size(0) * number_channels);
884      count--;
885    }
886  }
887  )
888
889/*
890%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
891%                                                                             %
892%                                                                             %
893%                                                                             %
894%    B l u r                                                                  %
895%                                                                             %
896%                                                                             %
897%                                                                             %
898%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
899*/
900
901  STRINGIFY(
902  /*
903  Reduce image noise and reduce detail levels by row
904  */
905  __kernel void BlurRow(const __global CLQuantum *image,
906    const unsigned int number_channels,const ChannelType channel,
907    __constant float *filter,const unsigned int width,
908    const unsigned int imageColumns,const unsigned int imageRows,
909    __local float4 *temp,__global float4 *tempImage)
910  {
911    const int x = get_global_id(0);
912    const int y = get_global_id(1);
913
914    const int columns = imageColumns;
915
916    const unsigned int radius = (width-1)/2;
917    const int wsize = get_local_size(0);
918    const unsigned int loadSize = wsize+width;
919
920    //group coordinate
921    const int groupX=get_local_size(0)*get_group_id(0);
922
923    //parallel load and clamp
924    for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
925    {
926      int cx = ClampToCanvas(i + groupX - radius, columns);
927      temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
928    }
929
930    // barrier
931    barrier(CLK_LOCAL_MEM_FENCE);
932
933    // only do the work if this is not a patched item
934    if (get_global_id(0) < columns)
935    {
936      // compute
937      float4 result = (float4) 0;
938
939      int i = 0;
940
941      for ( ; i+7 < width; )
942      {
943        for (int j=0; j < 8; j++)
944          result+=filter[i+j]*temp[i+j+get_local_id(0)];
945        i+=8;
946      }
947
948      for ( ; i < width; i++)
949        result+=filter[i]*temp[i+get_local_id(0)];
950
951      // write back to global
952      tempImage[y*columns+x] = result;
953    }
954  }
955  )
956
957  STRINGIFY(
958  /*
959  Reduce image noise and reduce detail levels by line
960  */
961  __kernel void BlurColumn(const __global float4 *blurRowData,
962    const unsigned int number_channels,const ChannelType channel,
963    __constant float *filter,const unsigned int width,
964    const unsigned int imageColumns,const unsigned int imageRows,
965    __local float4 *temp,__global CLQuantum *filteredImage)
966  {
967    const int x = get_global_id(0);
968    const int y = get_global_id(1);
969
970    const int columns = imageColumns;
971    const int rows = imageRows;
972
973    unsigned int radius = (width-1)/2;
974    const int wsize = get_local_size(1);
975    const unsigned int loadSize = wsize+width;
976
977    //group coordinate
978    const int groupX=get_local_size(0)*get_group_id(0);
979    const int groupY=get_local_size(1)*get_group_id(1);
980    //notice that get_local_size(0) is 1, so
981    //groupX=get_group_id(0);
982
983    //parallel load and clamp
984    for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
985      temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
986
987    // barrier
988    barrier(CLK_LOCAL_MEM_FENCE);
989
990    // only do the work if this is not a patched item
991    if (get_global_id(1) < rows)
992    {
993      // compute
994      float4 result = (float4) 0;
995
996      int i = 0;
997
998      for ( ; i+7 < width; )
999      {
1000        for (int j=0; j < 8; j++)
1001          result+=filter[i+j]*temp[i+j+get_local_id(1)];
1002        i+=8;
1003      }
1004
1005      for ( ; i < width; i++)
1006        result+=filter[i]*temp[i+get_local_id(1)];
1007
1008      // write back to global
1009      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1010    }
1011  }
1012  )
1013
1014/*
1015%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1016%                                                                             %
1017%                                                                             %
1018%                                                                             %
1019%    C o n t r a s t                                                          %
1020%                                                                             %
1021%                                                                             %
1022%                                                                             %
1023%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1024*/
1025
1026  STRINGIFY(
1027
1028  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1029    float3 HueSaturationBrightness;
1030    HueSaturationBrightness.x = 0.0f; // Hue
1031    HueSaturationBrightness.y = 0.0f; // Saturation
1032    HueSaturationBrightness.z = 0.0f; // Brightness
1033
1034    float r=(float) getRed(pixel);
1035    float g=(float) getGreen(pixel);
1036    float b=(float) getBlue(pixel);
1037
1038    float tmin=MagickMin(MagickMin(r,g),b);
1039    float tmax=MagickMax(MagickMax(r,g),b);
1040
1041    if (tmax!=0.0f) {
1042      float delta=tmax-tmin;
1043      HueSaturationBrightness.y=delta/tmax;
1044      HueSaturationBrightness.z=QuantumScale*tmax;
1045
1046      if (delta != 0.0f) {
1047  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1048  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1049        HueSaturationBrightness.x/=6.0f;
1050        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1051      }
1052    }
1053    return HueSaturationBrightness;
1054  }
1055
1056  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1057
1058    float hue = HueSaturationBrightness.x;
1059    float brightness = HueSaturationBrightness.z;
1060    float saturation = HueSaturationBrightness.y;
1061
1062    CLPixelType rgb;
1063
1064    if (saturation == 0.0f) {
1065      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1066      setGreen(&rgb,getRed(rgb));
1067      setBlue(&rgb,getRed(rgb));
1068    }
1069    else {
1070
1071      float h=6.0f*(hue-floor(hue));
1072      float f=h-floor(h);
1073      float p=brightness*(1.0f-saturation);
1074      float q=brightness*(1.0f-saturation*f);
1075      float t=brightness*(1.0f-(saturation*(1.0f-f)));
1076
1077      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1078      float clamped_t = ClampToQuantum(QuantumRange*t);
1079      float clamped_p = ClampToQuantum(QuantumRange*p);
1080      float clamped_q = ClampToQuantum(QuantumRange*q);
1081      int ih = (int)h;
1082      setRed(&rgb, (ih == 1)?clamped_q:
1083        (ih == 2 || ih == 3)?clamped_p:
1084        (ih == 4)?clamped_t:
1085                 clampedBrightness);
1086
1087      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1088        (ih == 3)?clamped_q:
1089        (ih == 4 || ih == 5)?clamped_p:
1090                 clamped_t);
1091
1092      setBlue(&rgb, (ih == 2)?clamped_t:
1093        (ih == 3 || ih == 4)?clampedBrightness:
1094        (ih == 5)?clamped_q:
1095                 clamped_p);
1096    }
1097    return rgb;
1098  }
1099
1100  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1101  {
1102
1103    const int sign = sharpen!=0?1:-1;
1104    const int x = get_global_id(0);
1105    const int y = get_global_id(1);
1106    const int columns = get_global_size(0);
1107    const int c = x + y * columns;
1108
1109    CLPixelType pixel = im[c];
1110    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1111    float brightness = HueSaturationBrightness.z;
1112    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1113    brightness = clamp(brightness,0.0f,1.0f);
1114    HueSaturationBrightness.z = brightness;
1115
1116    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1117    filteredPixel.w = pixel.w;
1118    im[c] = filteredPixel;
1119  }
1120  )
1121
1122/*
1123%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1124%                                                                             %
1125%                                                                             %
1126%                                                                             %
1127%    C o n t r a s t S t r e t c h                                            %
1128%                                                                             %
1129%                                                                             %
1130%                                                                             %
1131%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1132*/
1133
1134    STRINGIFY(
1135    /*
1136    */
1137    __kernel void Histogram(__global CLPixelType * restrict im,
1138      const ChannelType channel,
1139      const unsigned int colorspace,
1140      const unsigned int method,
1141      __global uint4 * restrict histogram)
1142      {
1143        const int x = get_global_id(0);
1144        const int y = get_global_id(1);
1145        const int columns = get_global_size(0);
1146        const int c = x + y * columns;
1147        if ((channel & SyncChannels) != 0)
1148        {
1149          float red=(float)getRed(im[c]);
1150          float green=(float)getGreen(im[c]);
1151          float blue=(float)getBlue(im[c]);
1152
1153          float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1154          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1155          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1156        }
1157        else
1158        {
1159          // for equalizing, we always need all channels?
1160          // otherwise something more
1161        }
1162      }
1163    )
1164
1165    STRINGIFY(
1166    /*
1167    */
1168    __kernel void ContrastStretch(__global CLPixelType * restrict im,
1169      const ChannelType channel,
1170      __global CLPixelType * restrict stretch_map,
1171      const float4 white, const float4 black)
1172      {
1173        const int x = get_global_id(0);
1174        const int y = get_global_id(1);
1175        const int columns = get_global_size(0);
1176        const int c = x + y * columns;
1177
1178        uint ePos;
1179        CLPixelType oValue, eValue;
1180        CLQuantum red, green, blue, alpha;
1181
1182        //read from global
1183        oValue=im[c];
1184
1185        if ((channel & RedChannel) != 0)
1186        {
1187          if (getRedF4(white) != getRedF4(black))
1188          {
1189            ePos = ScaleQuantumToMap(getRed(oValue));
1190            eValue = stretch_map[ePos];
1191            red = getRed(eValue);
1192          }
1193        }
1194
1195        if ((channel & GreenChannel) != 0)
1196        {
1197          if (getGreenF4(white) != getGreenF4(black))
1198          {
1199            ePos = ScaleQuantumToMap(getGreen(oValue));
1200            eValue = stretch_map[ePos];
1201            green = getGreen(eValue);
1202          }
1203        }
1204
1205        if ((channel & BlueChannel) != 0)
1206        {
1207          if (getBlueF4(white) != getBlueF4(black))
1208          {
1209            ePos = ScaleQuantumToMap(getBlue(oValue));
1210            eValue = stretch_map[ePos];
1211            blue = getBlue(eValue);
1212          }
1213        }
1214
1215        if ((channel & AlphaChannel) != 0)
1216        {
1217          if (getAlphaF4(white) != getAlphaF4(black))
1218          {
1219            ePos = ScaleQuantumToMap(getAlpha(oValue));
1220            eValue = stretch_map[ePos];
1221            alpha = getAlpha(eValue);
1222          }
1223        }
1224
1225        //write back
1226        im[c]=(CLPixelType)(blue, green, red, alpha);
1227
1228      }
1229    )
1230
1231/*
1232%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1233%                                                                             %
1234%                                                                             %
1235%                                                                             %
1236%    C o n v o l v e                                                          %
1237%                                                                             %
1238%                                                                             %
1239%                                                                             %
1240%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1241*/
1242
1243  STRINGIFY(
1244    __kernel
1245    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1246    const unsigned int imageWidth, const unsigned int imageHeight,
1247    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1248    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1249
1250      int2 blockID;
1251      blockID.x = get_global_id(0) / get_local_size(0);
1252      blockID.y = get_global_id(1) / get_local_size(1);
1253
1254      // image area processed by this workgroup
1255      int2 imageAreaOrg;
1256      imageAreaOrg.x = blockID.x * get_local_size(0);
1257      imageAreaOrg.y = blockID.y * get_local_size(1);
1258
1259      int2 midFilterDimen;
1260      midFilterDimen.x = (filterWidth-1)/2;
1261      midFilterDimen.y = (filterHeight-1)/2;
1262
1263      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1264
1265      // dimension of the local cache
1266      int2 cachedAreaDimen;
1267      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1268      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1269
1270      // cache the pixels accessed by this workgroup in local memory
1271      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1272      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1273      int groupSize = get_local_size(0) * get_local_size(1);
1274      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1275
1276        int2 cachedAreaIndex;
1277        cachedAreaIndex.x = i % cachedAreaDimen.x;
1278        cachedAreaIndex.y = i / cachedAreaDimen.x;
1279
1280        int2 imagePixelIndex;
1281        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1282
1283        // only support EdgeVirtualPixelMethod through ClampToCanvas
1284        // TODO: implement other virtual pixel method
1285        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1286        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1287
1288        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1289      }
1290
1291      // cache the filter
1292      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1293        filterCache[i] = filter[i];
1294      }
1295      barrier(CLK_LOCAL_MEM_FENCE);
1296
1297
1298      int2 imageIndex;
1299      imageIndex.x = imageAreaOrg.x + get_local_id(0);
1300      imageIndex.y = imageAreaOrg.y + get_local_id(1);
1301
1302      // if out-of-range, stops here and quit
1303      if (imageIndex.x >= imageWidth
1304        || imageIndex.y >= imageHeight) {
1305          return;
1306      }
1307
1308      int filterIndex = 0;
1309      float4 sum = (float4)0.0f;
1310      float gamma = 0.0f;
1311      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1312        int cacheIndexY = get_local_id(1);
1313        for (int j = 0; j < filterHeight; j++) {
1314          int cacheIndexX = get_local_id(0);
1315          for (int i = 0; i < filterWidth; i++) {
1316            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1317            float f = filterCache[filterIndex];
1318
1319            sum.x += f * p.x;
1320            sum.y += f * p.y;
1321            sum.z += f * p.z;
1322            sum.w += f * p.w;
1323
1324            gamma += f;
1325            filterIndex++;
1326            cacheIndexX++;
1327          }
1328          cacheIndexY++;
1329        }
1330      }
1331      else {
1332        int cacheIndexY = get_local_id(1);
1333        for (int j = 0; j < filterHeight; j++) {
1334          int cacheIndexX = get_local_id(0);
1335          for (int i = 0; i < filterWidth; i++) {
1336
1337            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1338            float alpha = QuantumScale*p.w;
1339            float f = filterCache[filterIndex];
1340            float g = alpha * f;
1341
1342            sum.x += g*p.x;
1343            sum.y += g*p.y;
1344            sum.z += g*p.z;
1345            sum.w += f*p.w;
1346
1347            gamma += g;
1348            filterIndex++;
1349            cacheIndexX++;
1350          }
1351          cacheIndexY++;
1352        }
1353        gamma = PerceptibleReciprocal(gamma);
1354        sum.xyz = gamma*sum.xyz;
1355      }
1356      CLPixelType outputPixel;
1357      outputPixel.x = ClampToQuantum(sum.x);
1358      outputPixel.y = ClampToQuantum(sum.y);
1359      outputPixel.z = ClampToQuantum(sum.z);
1360      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1361
1362      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1363    }
1364  )
1365
1366  STRINGIFY(
1367    __kernel
1368    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1369                  const uint imageWidth, const uint imageHeight,
1370                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1371                  const uint matte, const ChannelType channel) {
1372
1373      int2 imageIndex;
1374      imageIndex.x = get_global_id(0);
1375      imageIndex.y = get_global_id(1);
1376
1377      /*
1378      unsigned int imageWidth = get_global_size(0);
1379      unsigned int imageHeight = get_global_size(1);
1380      */
1381      if (imageIndex.x >= imageWidth
1382          || imageIndex.y >= imageHeight)
1383          return;
1384
1385      int2 midFilterDimen;
1386      midFilterDimen.x = (filterWidth-1)/2;
1387      midFilterDimen.y = (filterHeight-1)/2;
1388
1389      int filterIndex = 0;
1390      float4 sum = (float4)0.0f;
1391      float gamma = 0.0f;
1392      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1393        for (int j = 0; j < filterHeight; j++) {
1394          int2 inputPixelIndex;
1395          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1396          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1397          for (int i = 0; i < filterWidth; i++) {
1398            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1399            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1400
1401            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1402            float f = filter[filterIndex];
1403
1404            sum.x += f * p.x;
1405            sum.y += f * p.y;
1406            sum.z += f * p.z;
1407            sum.w += f * p.w;
1408
1409            gamma += f;
1410
1411            filterIndex++;
1412          }
1413        }
1414      }
1415      else {
1416
1417        for (int j = 0; j < filterHeight; j++) {
1418          int2 inputPixelIndex;
1419          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1420          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1421          for (int i = 0; i < filterWidth; i++) {
1422            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1423            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1424
1425            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1426            float alpha = QuantumScale*p.w;
1427            float f = filter[filterIndex];
1428            float g = alpha * f;
1429
1430            sum.x += g*p.x;
1431            sum.y += g*p.y;
1432            sum.z += g*p.z;
1433            sum.w += f*p.w;
1434
1435            gamma += g;
1436
1437
1438            filterIndex++;
1439          }
1440        }
1441        gamma = PerceptibleReciprocal(gamma);
1442        sum.xyz = gamma*sum.xyz;
1443      }
1444
1445      CLPixelType outputPixel;
1446      outputPixel.x = ClampToQuantum(sum.x);
1447      outputPixel.y = ClampToQuantum(sum.y);
1448      outputPixel.z = ClampToQuantum(sum.z);
1449      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1450
1451      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1452    }
1453  )
1454
1455/*
1456%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1457%                                                                             %
1458%                                                                             %
1459%                                                                             %
1460%     D e s p e c k l e                                                       %
1461%                                                                             %
1462%                                                                             %
1463%                                                                             %
1464%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1465*/
1466
1467  STRINGIFY(
1468
1469  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1470  , const unsigned int imageWidth, const unsigned int imageHeight
1471  , const int2 offset, const int polarity, const int matte) {
1472
1473    int x = get_global_id(0);
1474    int y = get_global_id(1);
1475
1476    CLPixelType v = inputImage[y*imageWidth+x];
1477
1478    int2 neighbor;
1479    neighbor.y = y + offset.y;
1480    neighbor.x = x + offset.x;
1481
1482    int2 clampedNeighbor;
1483    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1484    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1485
1486    CLPixelType r = (clampedNeighbor.x == neighbor.x
1487                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1488    :(CLPixelType)0;
1489
1490    int sv[4];
1491    sv[0] = (int)v.x;
1492    sv[1] = (int)v.y;
1493    sv[2] = (int)v.z;
1494    sv[3] = (int)v.w;
1495
1496    int sr[4];
1497    sr[0] = (int)r.x;
1498    sr[1] = (int)r.y;
1499    sr[2] = (int)r.z;
1500    sr[3] = (int)r.w;
1501
1502    if (polarity > 0) {
1503      \n #pragma unroll 4\n
1504      for (unsigned int i = 0; i < 4; i++) {
1505        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1506      }
1507    }
1508    else {
1509      \n #pragma unroll 4\n
1510      for (unsigned int i = 0; i < 4; i++) {
1511        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1512      }
1513
1514    }
1515
1516    v.x = (CLQuantum)sv[0];
1517    v.y = (CLQuantum)sv[1];
1518    v.z = (CLQuantum)sv[2];
1519
1520    if (matte!=0)
1521      v.w = (CLQuantum)sv[3];
1522
1523    outputImage[y*imageWidth+x] = v;
1524
1525    }
1526
1527
1528  )
1529
1530  STRINGIFY(
1531
1532  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1533  , const unsigned int imageWidth, const unsigned int imageHeight
1534  , const int2 offset, const int polarity, const int matte) {
1535
1536    int x = get_global_id(0);
1537    int y = get_global_id(1);
1538
1539    CLPixelType v = inputImage[y*imageWidth+x];
1540
1541    int2 neighbor, clampedNeighbor;
1542
1543    neighbor.y = y + offset.y;
1544    neighbor.x = x + offset.x;
1545    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1546    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1547
1548    CLPixelType r = (clampedNeighbor.x == neighbor.x
1549      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1550    :(CLPixelType)0;
1551
1552
1553    neighbor.y = y - offset.y;
1554    neighbor.x = x - offset.x;
1555    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1556    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1557
1558    CLPixelType s = (clampedNeighbor.x == neighbor.x
1559      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1560    :(CLPixelType)0;
1561
1562
1563    int sv[4];
1564    sv[0] = (int)v.x;
1565    sv[1] = (int)v.y;
1566    sv[2] = (int)v.z;
1567    sv[3] = (int)v.w;
1568
1569    int sr[4];
1570    sr[0] = (int)r.x;
1571    sr[1] = (int)r.y;
1572    sr[2] = (int)r.z;
1573    sr[3] = (int)r.w;
1574
1575    int ss[4];
1576    ss[0] = (int)s.x;
1577    ss[1] = (int)s.y;
1578    ss[2] = (int)s.z;
1579    ss[3] = (int)s.w;
1580
1581    if (polarity > 0) {
1582      \n #pragma unroll 4\n
1583      for (unsigned int i = 0; i < 4; i++) {
1584        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1585        //
1586        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1587        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1588        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1589      }
1590    }
1591    else {
1592      \n #pragma unroll 4\n
1593      for (unsigned int i = 0; i < 4; i++) {
1594        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1595        //
1596        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1597        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1598      }
1599    }
1600
1601    v.x = (CLQuantum)sv[0];
1602    v.y = (CLQuantum)sv[1];
1603    v.z = (CLQuantum)sv[2];
1604
1605    if (matte!=0)
1606      v.w = (CLQuantum)sv[3];
1607
1608    outputImage[y*imageWidth+x] = v;
1609
1610    }
1611  )
1612
1613/*
1614%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1615%                                                                             %
1616%                                                                             %
1617%                                                                             %
1618%     E q u a l i z e                                                         %
1619%                                                                             %
1620%                                                                             %
1621%                                                                             %
1622%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1623*/
1624
1625    STRINGIFY(
1626    /*
1627    */
1628    __kernel void Equalize(__global CLPixelType * restrict im,
1629      const ChannelType channel,
1630      __global CLPixelType * restrict equalize_map,
1631      const float4 white, const float4 black)
1632      {
1633        const int x = get_global_id(0);
1634        const int y = get_global_id(1);
1635        const int columns = get_global_size(0);
1636        const int c = x + y * columns;
1637
1638        uint ePos;
1639        CLPixelType oValue, eValue;
1640        CLQuantum red, green, blue, alpha;
1641
1642        //read from global
1643        oValue=im[c];
1644
1645        if ((channel & SyncChannels) != 0)
1646        {
1647          if (getRedF4(white) != getRedF4(black))
1648          {
1649            ePos = ScaleQuantumToMap(getRed(oValue));
1650            eValue = equalize_map[ePos];
1651            red = getRed(eValue);
1652            ePos = ScaleQuantumToMap(getGreen(oValue));
1653            eValue = equalize_map[ePos];
1654            green = getRed(eValue);
1655            ePos = ScaleQuantumToMap(getBlue(oValue));
1656            eValue = equalize_map[ePos];
1657            blue = getRed(eValue);
1658            ePos = ScaleQuantumToMap(getAlpha(oValue));
1659            eValue = equalize_map[ePos];
1660            alpha = getRed(eValue);
1661
1662            //write back
1663            im[c]=(CLPixelType)(blue, green, red, alpha);
1664          }
1665
1666        }
1667
1668        // for equalizing, we always need all channels?
1669        // otherwise something more
1670
1671     }
1672    )
1673
1674/*
1675%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1676%                                                                             %
1677%                                                                             %
1678%                                                                             %
1679%     F u n c t i o n                                                         %
1680%                                                                             %
1681%                                                                             %
1682%                                                                             %
1683%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1684*/
1685
1686  STRINGIFY(
1687  /*
1688  apply FunctionImageChannel(braightness-contrast)
1689  */
1690  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1691    const unsigned int number_parameters,__constant float *parameters)
1692  {
1693    float result = 0.0f;
1694
1695    switch (function)
1696    {
1697    case PolynomialFunction:
1698      {
1699        for (unsigned int i=0; i < number_parameters; i++)
1700          result = result*QuantumScale*pixel + parameters[i];
1701        result *= QuantumRange;
1702        break;
1703      }
1704    case SinusoidFunction:
1705      {
1706        float  freq,phase,ampl,bias;
1707        freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1708        phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1709        ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1710        bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1711        result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1712          (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1713        break;
1714      }
1715    case ArcsinFunction:
1716      {
1717        float  width,range,center,bias;
1718        width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1719        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1720        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1721        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1722
1723        result = 2.0f/width*(QuantumScale*pixel - center);
1724        result = range/MagickPI*asin(result)+bias;
1725        result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1726        result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1727        result *= QuantumRange;
1728        break;
1729      }
1730    case ArctanFunction:
1731      {
1732        float slope,range,center,bias;
1733        slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1734        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1735        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1736        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1737        result = MagickPI*slope*(QuantumScale*pixel-center);
1738        result = QuantumRange*(range/MagickPI*atan(result) + bias);
1739        break;
1740      }
1741    case UndefinedFunction:
1742      break;
1743    }
1744    return(ClampToQuantum(result));
1745  }
1746  )
1747
1748  STRINGIFY(
1749  /*
1750  Improve brightness / contrast of the image
1751  channel : define which channel is improved
1752  function : the function called to enchance the brightness contrast
1753  number_parameters : numbers of parameters
1754  parameters : the parameter
1755  */
1756  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1757    const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1758    __constant float *parameters)
1759  {
1760    const unsigned int x = get_global_id(0);
1761    const unsigned int y = get_global_id(1);
1762    const unsigned int columns = get_global_size(0);
1763    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1764
1765    float red;
1766    float green;
1767    float blue;
1768    float alpha;
1769
1770    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1771
1772    if ((channel & RedChannel) != 0)
1773      red=ApplyFunction(red, function, number_parameters, parameters);
1774
1775    if (number_channels > 2)
1776      {
1777        if ((channel & GreenChannel) != 0)
1778          green=ApplyFunction(green, function, number_parameters, parameters);
1779
1780        if ((channel & BlueChannel) != 0)
1781          blue=ApplyFunction(blue, function, number_parameters, parameters);
1782      }
1783
1784    if (((number_channels == 4) || (number_channels == 2)) &&
1785        ((channel & AlphaChannel) != 0))
1786      alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1787
1788    WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1789  }
1790  )
1791
1792/*
1793%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1794%                                                                             %
1795%                                                                             %
1796%                                                                             %
1797%     G r a y s c a l e                                                       %
1798%                                                                             %
1799%                                                                             %
1800%                                                                             %
1801%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1802*/
1803
1804  STRINGIFY(
1805  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1806    const unsigned int colorspace,const unsigned int method)
1807  {
1808    const unsigned int x = get_global_id(0);
1809    const unsigned int y = get_global_id(1);
1810    const unsigned int columns = get_global_size(0);
1811    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1812
1813    float
1814      blue,
1815      green,
1816      red;
1817
1818    red=getPixelRed(p);
1819    green=getPixelGreen(p);
1820    blue=getPixelBlue(p);
1821
1822    CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1823
1824    setPixelRed(p,intensity);
1825    setPixelGreen(p,intensity);
1826    setPixelBlue(p,intensity);
1827  }
1828  )
1829
1830/*
1831%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1832%                                                                             %
1833%                                                                             %
1834%                                                                             %
1835%     L o c a l C o n t r a s t                                               %
1836%                                                                             %
1837%                                                                             %
1838%                                                                             %
1839%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1840*/
1841
1842    STRINGIFY(
1843
1844      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1845          const int radius,
1846          const int imageWidth,
1847          const int imageHeight)
1848      {
1849        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1850
1851        int x = get_local_id(0);
1852        int y = get_global_id(1);
1853
1854        global CLPixelType *src = srcImage + y * imageWidth;
1855
1856        for (int i = x; i < imageWidth; i += get_local_size(0)) {
1857            float sum = 0.0f;
1858            float weight = 1.0f;
1859
1860            int j = i - radius;
1861            while ((j + 7) < i) {
1862                for (int k = 0; k < 8; ++k) // Unroll 8x
1863                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1864                weight += 8.0f;
1865                j+=8;
1866            }
1867            while (j < i) {
1868                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1869                weight += 1.0f;
1870                ++j;
1871            }
1872
1873            while ((j + 7) < radius + i) {
1874                for (int k = 0; k < 8; ++k) // Unroll 8x
1875                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1876                weight -= 8.0f;
1877                j+=8;
1878            }
1879            while (j < radius + i) {
1880                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1881                weight -= 1.0f;
1882                ++j;
1883            }
1884
1885            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1886        }
1887      }
1888    )
1889
1890    STRINGIFY(
1891      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1892          const int radius,
1893          const float strength,
1894          const int imageWidth,
1895          const int imageHeight)
1896      {
1897        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1898
1899        int x = get_global_id(0);
1900        int y = get_global_id(1);
1901
1902        if ((x >= imageWidth) || (y >= imageHeight))
1903                return;
1904
1905        global float *src = blurImage + x;
1906
1907        float sum = 0.0f;
1908        float weight = 1.0f;
1909
1910        int j = y - radius;
1911        while ((j + 7) < y) {
1912            for (int k = 0; k < 8; ++k) // Unroll 8x
1913                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1914            weight += 8.0f;
1915            j+=8;
1916        }
1917        while (j < y) {
1918            sum += weight * src[mirrorBottom(j) * imageWidth];
1919            weight += 1.0f;
1920            ++j;
1921        }
1922
1923        while ((j + 7) < radius + y) {
1924            for (int k = 0; k < 8; ++k) // Unroll 8x
1925                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1926            weight -= 8.0f;
1927            j+=8;
1928        }
1929        while (j < radius + y) {
1930            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1931            weight -= 1.0f;
1932            ++j;
1933        }
1934
1935        CLPixelType pixel = srcImage[x + y * imageWidth];
1936        float srcVal = dot(RGB, convert_float4(pixel));
1937        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1938        mult = (srcVal + mult) / srcVal;
1939
1940        pixel.x = ClampToQuantum(pixel.x * mult);
1941        pixel.y = ClampToQuantum(pixel.y * mult);
1942        pixel.z = ClampToQuantum(pixel.z * mult);
1943
1944        dstImage[x + y * imageWidth] = pixel;
1945      }
1946    )
1947
1948/*
1949%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1950%                                                                             %
1951%                                                                             %
1952%                                                                             %
1953%     M o d u l a t e                                                         %
1954%                                                                             %
1955%                                                                             %
1956%                                                                             %
1957%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1958*/
1959
1960  STRINGIFY(
1961
1962  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1963    float *hue, float *saturation, float *lightness)
1964  {
1965  float
1966    c,
1967    tmax,
1968    tmin;
1969
1970  /*
1971     Convert RGB to HSL colorspace.
1972     */
1973  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1974  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1975
1976  c=tmax-tmin;
1977
1978  *lightness=(tmax+tmin)/2.0;
1979  if (c <= 0.0)
1980  {
1981    *hue=0.0;
1982    *saturation=0.0;
1983    return;
1984  }
1985
1986  if (tmax == (QuantumScale*red))
1987  {
1988    *hue=(QuantumScale*green-QuantumScale*blue)/c;
1989    if ((QuantumScale*green) < (QuantumScale*blue))
1990      *hue+=6.0;
1991  }
1992  else
1993    if (tmax == (QuantumScale*green))
1994      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1995    else
1996      *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1997
1998  *hue*=60.0/360.0;
1999  if (*lightness <= 0.5)
2000    *saturation=c/(2.0*(*lightness));
2001  else
2002    *saturation=c/(2.0-2.0*(*lightness));
2003  }
2004
2005  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2006      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2007  {
2008    float
2009      b,
2010      c,
2011      g,
2012      h,
2013      tmin,
2014      r,
2015      x;
2016
2017    /*
2018       Convert HSL to RGB colorspace.
2019       */
2020    h=hue*360.0;
2021    if (lightness <= 0.5)
2022      c=2.0*lightness*saturation;
2023    else
2024      c=(2.0-2.0*lightness)*saturation;
2025    tmin=lightness-0.5*c;
2026    h-=360.0*floor(h/360.0);
2027    h/=60.0;
2028    x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2029    switch ((int) floor(h) % 6)
2030    {
2031      case 0:
2032        {
2033          r=tmin+c;
2034          g=tmin+x;
2035          b=tmin;
2036          break;
2037        }
2038      case 1:
2039        {
2040          r=tmin+x;
2041          g=tmin+c;
2042          b=tmin;
2043          break;
2044        }
2045      case 2:
2046        {
2047          r=tmin;
2048          g=tmin+c;
2049          b=tmin+x;
2050          break;
2051        }
2052      case 3:
2053        {
2054          r=tmin;
2055          g=tmin+x;
2056          b=tmin+c;
2057          break;
2058        }
2059      case 4:
2060        {
2061          r=tmin+x;
2062          g=tmin;
2063          b=tmin+c;
2064          break;
2065        }
2066      case 5:
2067        {
2068          r=tmin+c;
2069          g=tmin;
2070          b=tmin+x;
2071          break;
2072        }
2073      default:
2074        {
2075          r=0.0;
2076          g=0.0;
2077          b=0.0;
2078        }
2079    }
2080    *red=ClampToQuantum(QuantumRange*r);
2081    *green=ClampToQuantum(QuantumRange*g);
2082    *blue=ClampToQuantum(QuantumRange*b);
2083  }
2084
2085  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2086    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2087  {
2088    float
2089      hue,
2090      lightness,
2091      saturation;
2092
2093    /*
2094    Increase or decrease color lightness, saturation, or hue.
2095    */
2096    ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2097    hue+=0.5*(0.01*percent_hue-1.0);
2098    while (hue < 0.0)
2099      hue+=1.0;
2100    while (hue >= 1.0)
2101      hue-=1.0;
2102    saturation*=0.01*percent_saturation;
2103    lightness*=0.01*percent_lightness;
2104    ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2105  }
2106
2107  __kernel void Modulate(__global CLPixelType *im,
2108    const float percent_brightness,
2109    const float percent_hue,
2110    const float percent_saturation,
2111    const int colorspace)
2112  {
2113
2114    const int x = get_global_id(0);
2115    const int y = get_global_id(1);
2116    const int columns = get_global_size(0);
2117    const int c = x + y * columns;
2118
2119    CLPixelType pixel = im[c];
2120
2121    CLQuantum
2122        blue,
2123        green,
2124        red;
2125
2126    red=getRed(pixel);
2127    green=getGreen(pixel);
2128    blue=getBlue(pixel);
2129
2130    switch (colorspace)
2131    {
2132      case HSLColorspace:
2133      default:
2134        {
2135          ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2136              &red, &green, &blue);
2137        }
2138
2139    }
2140
2141    CLPixelType filteredPixel;
2142
2143    setRed(&filteredPixel, red);
2144    setGreen(&filteredPixel, green);
2145    setBlue(&filteredPixel, blue);
2146    filteredPixel.w = pixel.w;
2147
2148    im[c] = filteredPixel;
2149  }
2150  )
2151
2152/*
2153%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2154%                                                                             %
2155%                                                                             %
2156%                                                                             %
2157%     M o t i o n B l u r                                                     %
2158%                                                                             %
2159%                                                                             %
2160%                                                                             %
2161%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2162*/
2163
2164  STRINGIFY(
2165    __kernel
2166    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2167                    const unsigned int imageWidth, const unsigned int imageHeight,
2168                    const __global float *filter, const unsigned int width, const __global int2* offset,
2169                    const float4 bias,
2170                    const ChannelType channel, const unsigned int matte) {
2171
2172      int2 currentPixel;
2173      currentPixel.x = get_global_id(0);
2174      currentPixel.y = get_global_id(1);
2175
2176      if (currentPixel.x >= imageWidth
2177          || currentPixel.y >= imageHeight)
2178          return;
2179
2180      float4 pixel;
2181      pixel.x = (float)bias.x;
2182      pixel.y = (float)bias.y;
2183      pixel.z = (float)bias.z;
2184      pixel.w = (float)bias.w;
2185
2186      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2187
2188        for (int i = 0; i < width; i++) {
2189          // only support EdgeVirtualPixelMethod through ClampToCanvas
2190          // TODO: implement other virtual pixel method
2191          int2 samplePixel = currentPixel + offset[i];
2192          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2193          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2194          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2195
2196          pixel.x += (filter[i] * (float)samplePixelValue.x);
2197          pixel.y += (filter[i] * (float)samplePixelValue.y);
2198          pixel.z += (filter[i] * (float)samplePixelValue.z);
2199          pixel.w += (filter[i] * (float)samplePixelValue.w);
2200        }
2201
2202        CLPixelType outputPixel;
2203        outputPixel.x = ClampToQuantum(pixel.x);
2204        outputPixel.y = ClampToQuantum(pixel.y);
2205        outputPixel.z = ClampToQuantum(pixel.z);
2206        outputPixel.w = ClampToQuantum(pixel.w);
2207        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2208      }
2209      else {
2210
2211        float gamma = 0.0f;
2212        for (int i = 0; i < width; i++) {
2213          // only support EdgeVirtualPixelMethod through ClampToCanvas
2214          // TODO: implement other virtual pixel method
2215          int2 samplePixel = currentPixel + offset[i];
2216          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2217          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2218
2219          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2220
2221          float alpha = QuantumScale*samplePixelValue.w;
2222          float k = filter[i];
2223          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2224          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2225          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2226
2227          pixel.w += k * alpha * samplePixelValue.w;
2228
2229          gamma+=k*alpha;
2230        }
2231        gamma = PerceptibleReciprocal(gamma);
2232        pixel.xyz = gamma*pixel.xyz;
2233
2234        CLPixelType outputPixel;
2235        outputPixel.x = ClampToQuantum(pixel.x);
2236        outputPixel.y = ClampToQuantum(pixel.y);
2237        outputPixel.z = ClampToQuantum(pixel.z);
2238        outputPixel.w = ClampToQuantum(pixel.w);
2239        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2240      }
2241    }
2242  )
2243
2244/*
2245%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2246%                                                                             %
2247%                                                                             %
2248%                                                                             %
2249%     R e s i z e                                                             %
2250%                                                                             %
2251%                                                                             %
2252%                                                                             %
2253%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2254*/
2255
2256  STRINGIFY(
2257  // Based on Box from resize.c
2258  float BoxResizeFilter(const float x)
2259  {
2260    return 1.0f;
2261  }
2262  )
2263
2264  STRINGIFY(
2265  // Based on CubicBC from resize.c
2266  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2267  {
2268    /*
2269    Cubic Filters using B,C determined values:
2270    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2271    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2272    Spline              B = 1   C = 0    B-Spline Gaussian approximation
2273    Hermite             B = 0   C = 0    B-Spline interpolator
2274
2275    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2276    Graphics Computer Graphics, Volume 22, Number 4, August 1988
2277    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2278    Mitchell.pdf.
2279
2280    Coefficents are determined from B,C values:
2281    P0 = (  6 - 2*B       )/6 = coeff[0]
2282    P1 =         0
2283    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2284    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2285    Q0 = (      8*B +24*C )/6 = coeff[3]
2286    Q1 = (    -12*B -48*C )/6 = coeff[4]
2287    Q2 = (      6*B +30*C )/6 = coeff[5]
2288    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2289
2290    which are used to define the filter:
2291
2292    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2293    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2294
2295    which ensures function is continuous in value and derivative (slope).
2296    */
2297    if (x < 1.0)
2298      return(resizeFilterCoefficients[0]+x*(x*
2299      (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2300    if (x < 2.0)
2301      return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2302      (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2303    return(0.0);
2304  }
2305  )
2306
2307  STRINGIFY(
2308  float Sinc(const float x)
2309  {
2310    if (x != 0.0f)
2311    {
2312      const float alpha=(float) (MagickPI*x);
2313      return sinpi(x)/alpha;
2314    }
2315    return(1.0f);
2316  }
2317  )
2318
2319  STRINGIFY(
2320  float Triangle(const float x)
2321  {
2322    /*
2323    1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2324    a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2325    for Sinc().
2326    */
2327    return ((x<1.0f)?(1.0f-x):0.0f);
2328  }
2329  )
2330
2331
2332  STRINGIFY(
2333  float Hann(const float x)
2334  {
2335    /*
2336    Cosine window function:
2337      0.5+0.5*cos(pi*x).
2338    */
2339    const float cosine=cos((MagickPI*x));
2340    return(0.5f+0.5f*cosine);
2341  }
2342  )
2343
2344  STRINGIFY(
2345  float Hamming(const float x)
2346  {
2347    /*
2348      Offset cosine window function:
2349       .54 + .46 cos(pi x).
2350    */
2351    const float cosine=cos((MagickPI*x));
2352    return(0.54f+0.46f*cosine);
2353  }
2354  )
2355
2356  STRINGIFY(
2357  float Blackman(const float x)
2358  {
2359    /*
2360      Blackman: 2nd order cosine windowing function:
2361        0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2362
2363      Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2364      five flops.
2365    */
2366    const float cosine=cos((MagickPI*x));
2367    return(0.34f+cosine*(0.5f+cosine*0.16f));
2368  }
2369  )
2370
2371  STRINGIFY(
2372  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2373  {
2374    switch (filterType)
2375    {
2376    /* Call Sinc even for SincFast to get better precision on GPU
2377       and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2378    case SincWeightingFunction:
2379    case SincFastWeightingFunction:
2380      return Sinc(x);
2381    case CubicBCWeightingFunction:
2382      return CubicBC(x,filterCoefficients);
2383    case BoxWeightingFunction:
2384      return BoxResizeFilter(x);
2385    case TriangleWeightingFunction:
2386      return Triangle(x);
2387    case HannWeightingFunction:
2388      return Hann(x);
2389    case HammingWeightingFunction:
2390      return Hamming(x);
2391    case BlackmanWeightingFunction:
2392      return Blackman(x);
2393
2394    default:
2395      return 0.0f;
2396    }
2397  }
2398  )
2399
2400
2401  STRINGIFY(
2402  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2403           , const ResizeWeightingFunctionType resizeWindowType
2404           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2405  {
2406    float scale;
2407    float xBlur = fabs(x/resizeFilterBlur);
2408    if (resizeWindowSupport < MagickEpsilon
2409        || resizeWindowType == BoxWeightingFunction)
2410    {
2411      scale = 1.0f;
2412    }
2413    else
2414    {
2415      scale = resizeFilterScale;
2416      scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2417    }
2418    float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2419    return weight;
2420  }
2421
2422  )
2423
2424  ;
2425  const char *accelerateKernels2 =
2426
2427  STRINGIFY(
2428
2429  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2430    return (numWorkItems/pixelPerWorkgroup);
2431  }
2432
2433  // returns the index of the pixel for the current workitem to compute.
2434  // returns -1 if this workitem doesn't need to participate in any computation
2435  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2436    const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2437    int pixelIndex = itemID/numWorkItemsPerPixel;
2438    pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2439    return pixelIndex;
2440  }
2441
2442  )
2443
2444  STRINGIFY(
2445  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2446    void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2447      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2448      const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2449      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2450      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2451      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2452      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2453      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2454  {
2455    // calculate the range of resized image pixels computed by this workgroup
2456    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2457    const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2458    const unsigned int actualNumPixelToCompute = stopX - startX;
2459
2460    // calculate the range of input image pixels to cache
2461    float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2462    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2463    scale = PerceptibleReciprocal(scale);
2464
2465    const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2466    const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2467
2468    // cache the input pixels into local memory
2469    const unsigned int y = get_global_id(1);
2470    const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2471    const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2472    event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2473    wait_group_events(1, &e);
2474
2475    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2476    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2477    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2478    {
2479      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2480      const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2481      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2482
2483      // determine which resized pixel computed by this workitem
2484      const unsigned int itemID = get_local_id(0);
2485      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2486
2487      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2488
2489      float4 filteredPixel = (float4)0.0f;
2490      float density = 0.0f;
2491      float gamma = 0.0f;
2492      // -1 means this workitem doesn't participate in the computation
2493      if (pixelIndex != -1)
2494      {
2495        // x coordinated of the resized pixel computed by this workitem
2496        const int x = chunkStartX + pixelIndex;
2497
2498        // calculate how many steps required for this pixel
2499        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2500        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2501        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2502        const unsigned int n = stop - start;
2503
2504        // calculate how many steps this workitem will contribute
2505        unsigned int numStepsPerWorkItem = n / numItems;
2506        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2507
2508        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2509        if (startStep < n)
2510        {
2511          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2512
2513          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2514          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2515          {
2516            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2517              (ResizeWeightingFunctionType) resizeFilterType,
2518              (ResizeWeightingFunctionType) resizeWindowType,
2519              resizeFilterScale, resizeFilterWindowSupport,
2520              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2521
2522            float4 cp = (float4)0.0f;
2523
2524            __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2525            cp.x = (float) *(p);
2526            if (number_channels > 2)
2527            {
2528              cp.y = (float) *(p + 1);
2529              cp.z = (float) *(p + 2);
2530            }
2531
2532            if (alpha_index != 0)
2533            {
2534              cp.w = (float) *(p + alpha_index);
2535
2536              float alpha = weight * QuantumScale * cp.w;
2537
2538              filteredPixel.x += alpha * cp.x;
2539              filteredPixel.y += alpha * cp.y;
2540              filteredPixel.z += alpha * cp.z;
2541              filteredPixel.w += weight * cp.w;
2542              gamma += alpha;
2543            }
2544            else
2545              filteredPixel += ((float4) weight)*cp;
2546
2547            density += weight;
2548          }
2549        }
2550      }
2551
2552      // initialize the accumulators to zero
2553      if (itemID < actualNumPixelInThisChunk) {
2554        outputPixelCache[itemID] = (float4)0.0f;
2555        densityCache[itemID] = 0.0f;
2556        if (alpha_index != 0)
2557          gammaCache[itemID] = 0.0f;
2558      }
2559      barrier(CLK_LOCAL_MEM_FENCE);
2560
2561      // accumulatte the filtered pixel value and the density
2562      for (unsigned int i = 0; i < numItems; i++) {
2563        if (pixelIndex != -1) {
2564          if (itemID%numItems == i) {
2565            outputPixelCache[pixelIndex]+=filteredPixel;
2566            densityCache[pixelIndex]+=density;
2567            if (alpha_index != 0)
2568              gammaCache[pixelIndex]+=gamma;
2569          }
2570        }
2571        barrier(CLK_LOCAL_MEM_FENCE);
2572      }
2573
2574      if (itemID < actualNumPixelInThisChunk)
2575      {
2576        float4 filteredPixel = outputPixelCache[itemID];
2577
2578        float gamma = 0.0f;
2579        if (alpha_index != 0)
2580          gamma = gammaCache[itemID];
2581
2582        float density = densityCache[itemID];
2583        if ((density != 0.0f) && (density != 1.0f))
2584        {
2585          density = PerceptibleReciprocal(density);
2586          filteredPixel *= (float4) density;
2587          if (alpha_index != 0)
2588            gamma *= density;
2589        }
2590
2591        if (alpha_index != 0)
2592        {
2593          gamma = PerceptibleReciprocal(gamma);
2594          filteredPixel.x *= gamma;
2595          filteredPixel.y *= gamma;
2596          filteredPixel.z *= gamma;
2597        }
2598
2599        WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2600      }
2601    }
2602  }
2603  )
2604
2605
2606  STRINGIFY(
2607 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2608    void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2609      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2610      const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2611      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2612      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2613      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2614      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2615      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2616  {
2617    // calculate the range of resized image pixels computed by this workgroup
2618    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2619    const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2620    const unsigned int actualNumPixelToCompute = stopY - startY;
2621
2622    // calculate the range of input image pixels to cache
2623    float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2624    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2625    scale = PerceptibleReciprocal(scale);
2626
2627    const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2628    const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2629
2630    // cache the input pixels into local memory
2631    const unsigned int x = get_global_id(0);
2632    unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2633    unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2634    unsigned int stride = inputColumns * number_channels;
2635    for (unsigned int i = 0; i < number_channels; i++)
2636    {
2637      event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2638      wait_group_events(1,&e);
2639    }
2640
2641    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2642    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2643    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2644    {
2645      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2646      const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2647      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2648
2649      // determine which resized pixel computed by this workitem
2650      const unsigned int itemID = get_local_id(1);
2651      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2652
2653      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2654
2655      float4 filteredPixel = (float4)0.0f;
2656      float density = 0.0f;
2657      float gamma = 0.0f;
2658      // -1 means this workitem doesn't participate in the computation
2659      if (pixelIndex != -1)
2660      {
2661        // x coordinated of the resized pixel computed by this workitem
2662        const int y = chunkStartY + pixelIndex;
2663
2664        // calculate how many steps required for this pixel
2665        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2666        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2667        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2668        const unsigned int n = stop - start;
2669
2670        // calculate how many steps this workitem will contribute
2671        unsigned int numStepsPerWorkItem = n / numItems;
2672        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2673
2674        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2675        if (startStep < n)
2676        {
2677          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2678
2679          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2680          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2681          {
2682            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2683              (ResizeWeightingFunctionType) resizeFilterType,
2684              (ResizeWeightingFunctionType) resizeWindowType,
2685              resizeFilterScale, resizeFilterWindowSupport,
2686              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2687
2688            float4 cp = (float4)0.0f;
2689
2690            __local CLQuantum *p = inputImageCache + cacheIndex;
2691            cp.x = (float) *(p);
2692            if (number_channels > 2)
2693            {
2694              cp.y = (float) *(p + rangeLength);
2695              cp.z = (float) *(p + (rangeLength * 2));
2696            }
2697
2698            if (alpha_index != 0)
2699            {
2700              cp.w = (float) *(p + (rangeLength * alpha_index));
2701
2702              float alpha = weight * QuantumScale * cp.w;
2703
2704              filteredPixel.x += alpha * cp.x;
2705              filteredPixel.y += alpha * cp.y;
2706              filteredPixel.z += alpha * cp.z;
2707              filteredPixel.w += weight * cp.w;
2708              gamma += alpha;
2709            }
2710            else
2711              filteredPixel += ((float4) weight)*cp;
2712
2713            density += weight;
2714          }
2715        }
2716      }
2717
2718      // initialize the accumulators to zero
2719      if (itemID < actualNumPixelInThisChunk) {
2720        outputPixelCache[itemID] = (float4)0.0f;
2721        densityCache[itemID] = 0.0f;
2722        if (alpha_index != 0)
2723          gammaCache[itemID] = 0.0f;
2724      }
2725      barrier(CLK_LOCAL_MEM_FENCE);
2726
2727      // accumulatte the filtered pixel value and the density
2728      for (unsigned int i = 0; i < numItems; i++) {
2729        if (pixelIndex != -1) {
2730          if (itemID%numItems == i) {
2731            outputPixelCache[pixelIndex]+=filteredPixel;
2732            densityCache[pixelIndex]+=density;
2733            if (alpha_index != 0)
2734              gammaCache[pixelIndex]+=gamma;
2735          }
2736        }
2737        barrier(CLK_LOCAL_MEM_FENCE);
2738      }
2739
2740      if (itemID < actualNumPixelInThisChunk)
2741      {
2742        float4 filteredPixel = outputPixelCache[itemID];
2743
2744        float gamma = 0.0f;
2745        if (alpha_index != 0)
2746          gamma = gammaCache[itemID];
2747
2748        float density = densityCache[itemID];
2749        if ((density != 0.0f) && (density != 1.0f))
2750        {
2751          density = PerceptibleReciprocal(density);
2752          filteredPixel *= (float4) density;
2753          if (alpha_index != 0)
2754            gamma *= density;
2755        }
2756
2757        if (alpha_index != 0)
2758        {
2759          gamma = PerceptibleReciprocal(gamma);
2760          filteredPixel.x *= gamma;
2761          filteredPixel.y *= gamma;
2762          filteredPixel.z *= gamma;
2763        }
2764
2765        WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2766      }
2767    }
2768  }
2769  )
2770
2771/*
2772%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2773%                                                                             %
2774%                                                                             %
2775%                                                                             %
2776%     R o t a t i o n a l B l u r                                             %
2777%                                                                             %
2778%                                                                             %
2779%                                                                             %
2780%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2781*/
2782
2783  STRINGIFY(
2784  __kernel void RotationalBlur(const __global CLQuantum *image,
2785    const unsigned int number_channels,const unsigned int channel,
2786    const float2 blurCenter,__constant float *cos_theta,
2787    __constant float *sin_theta,const unsigned int cossin_theta_size,
2788    __global CLQuantum *filteredImage)
2789  {
2790    const int x = get_global_id(0);
2791    const int y = get_global_id(1);
2792    const int columns = get_global_size(0);
2793    const int rows = get_global_size(1);
2794    unsigned int step = 1;
2795    float center_x = (float) x - blurCenter.x;
2796    float center_y = (float) y - blurCenter.y;
2797    float radius = hypot(center_x, center_y);
2798
2799    float blur_radius = hypot(blurCenter.x, blurCenter.y);
2800
2801    if (radius > MagickEpsilon)
2802    {
2803      step = (unsigned int) (blur_radius / radius);
2804      if (step == 0)
2805        step = 1;
2806      if (step >= cossin_theta_size)
2807        step = cossin_theta_size-1;
2808    }
2809
2810    float4 result = 0.0f;
2811    float normalize = 0.0f;
2812    float gamma = 0.0f;
2813
2814    for (unsigned int i=0; i<cossin_theta_size; i+=step)
2815    {
2816      int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2817      int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2818
2819      float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2820
2821      if ((number_channels == 4) || (number_channels == 2))
2822      {
2823        float alpha = (float)(QuantumScale*pixel.w);
2824
2825        gamma += alpha;
2826
2827        result.x += alpha * pixel.x;
2828        result.y += alpha * pixel.y;
2829        result.z += alpha * pixel.z;
2830        result.w += pixel.w;
2831      }
2832      else
2833        result += pixel;
2834
2835      normalize += 1.0f;
2836    }
2837
2838    normalize = PerceptibleReciprocal(normalize);
2839
2840    if ((number_channels == 4) || (number_channels == 2))
2841    {
2842      gamma = PerceptibleReciprocal(gamma);
2843      result.x *= gamma;
2844      result.y *= gamma;
2845      result.z *= gamma;
2846      result.w *= normalize;
2847    }
2848    else
2849      result *= normalize;
2850
2851    WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2852  }
2853  )
2854
2855/*
2856%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2857%                                                                             %
2858%                                                                             %
2859%                                                                             %
2860%     U n s h a r p M a s k                                                   %
2861%                                                                             %
2862%                                                                             %
2863%                                                                             %
2864%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2865*/
2866
2867  STRINGIFY(
2868  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2869    const __global float4 *blurRowData,const unsigned int number_channels,
2870    const ChannelType channel,const unsigned int columns,
2871    const unsigned int rows,__local float4* cachedData,
2872    __local float* cachedFilter,const __global float *filter,
2873    const unsigned int width,const float gain, const float threshold,
2874    __global CLQuantum *filteredImage)
2875  {
2876    const unsigned int radius = (width-1)/2;
2877
2878    // cache the pixel shared by the workgroup
2879    const int groupX = get_group_id(0);
2880    const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2881    const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2882
2883    if ((groupStartY >= 0) && (groupStopY < rows))
2884    {
2885      event_t e = async_work_group_strided_copy(cachedData,
2886        blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2887      wait_group_events(1,&e);
2888    }
2889    else
2890    {
2891      for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2892        cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2893
2894      barrier(CLK_LOCAL_MEM_FENCE);
2895    }
2896    // cache the filter as well
2897    event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2898    wait_group_events(1,&e);
2899
2900    // only do the work if this is not a patched item
2901    const int cy = get_global_id(1);
2902
2903    if (cy < rows)
2904    {
2905      float4 blurredPixel = (float4) 0.0f;
2906
2907      int i = 0;
2908
2909      for ( ; i+7 < width; )
2910      {
2911        for (int j=0; j < 8; j++, i++)
2912          blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2913      }
2914
2915      for ( ; i < width; i++)
2916        blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2917
2918      float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2919      float4 outputPixel = inputImagePixel - blurredPixel;
2920
2921      float quantumThreshold = QuantumRange*threshold;
2922
2923      int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2924      outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2925
2926      //write back
2927      WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2928    }
2929  }
2930  )
2931
2932  STRINGIFY(
2933  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2934    const ChannelType channel,__constant float *filter,const unsigned int width,
2935    const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2936    const float gain,const float threshold,__global CLQuantum *filteredImage)
2937  {
2938    const unsigned int x = get_global_id(0);
2939    const unsigned int y = get_global_id(1);
2940
2941    const unsigned int radius = (width - 1) / 2;
2942
2943    int row = y - radius;
2944    int baseRow = get_group_id(1) * get_local_size(1) - radius;
2945    int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2946
2947    while (row < endRow) {
2948      int srcy = (row < 0) ? -row : row; // mirror pad
2949      srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2950
2951      float4 value = 0.0f;
2952
2953      int ix = x - radius;
2954      int i = 0;
2955
2956      while (i + 7 < width) {
2957        for (int j = 0; j < 8; ++j) { // unrolled
2958          int srcx = ix + j;
2959          srcx = (srcx < 0) ? -srcx : srcx;
2960          srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2961          value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2962        }
2963        ix += 8;
2964        i += 8;
2965      }
2966
2967      while (i < width) {
2968        int srcx = (ix < 0) ? -ix : ix; // mirror pad
2969        srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2970        value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2971        ++i;
2972        ++ix;
2973      }
2974      pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2975      row += get_local_size(1);
2976    }
2977
2978    barrier(CLK_LOCAL_MEM_FENCE);
2979
2980    const int px = get_local_id(0);
2981    const int py = get_local_id(1);
2982    const int prp = get_local_size(0);
2983    float4 value = (float4)(0.0f);
2984
2985    int i = 0;
2986    while (i + 7 < width) {
2987      for (int j = 0; j < 8; ++j) // unrolled
2988        value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2989      i += 8;
2990    }
2991    while (i < width) {
2992      value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2993      ++i;
2994    }
2995
2996    float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2997    float4 diff = srcPixel - value;
2998
2999    float quantumThreshold = QuantumRange*threshold;
3000
3001    int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3002    value = select(srcPixel + diff * gain, srcPixel, mask);
3003
3004    if ((x < columns) && (y < rows))
3005      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3006  }
3007  )
3008
3009  STRINGIFY(
3010    __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3011    void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3012      const unsigned int number_channels,const unsigned int max_channels,
3013      const float threshold,const int passes,const unsigned int imageWidth,
3014      const unsigned int imageHeight)
3015  {
3016    const int pad = (1 << (passes - 1));
3017    const int tileSize = 64;
3018    const int tileRowPixels = 64;
3019    const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3020
3021    CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3022
3023    local float buffer[64 * 64];
3024
3025    int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
3026    int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
3027
3028    for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3029      int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3030                (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3031
3032      for (int channel = 0; channel < max_channels; ++channel)
3033        stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3034    }
3035
3036    for (int channel = 0; channel < max_channels; ++channel) {
3037      // Load LDS
3038      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3039        buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3040
3041      // Process
3042
3043      float tmp[16];
3044      float accum[16];
3045      float pixel;
3046
3047      for (int i = 0; i < 16; i++)
3048        accum[i]=0.0f;
3049
3050      for (int pass = 0; pass < passes; ++pass) {
3051        const int radius = 1 << pass;
3052        const int x = get_local_id(0);
3053        const float thresh = threshold * noise[pass];
3054
3055        // Apply horizontal hat
3056        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3057          const int offset = i * tileRowPixels;
3058          if (pass == 0)
3059            tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3060          pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3061          barrier(CLK_LOCAL_MEM_FENCE);
3062          buffer[x + offset] = pixel;
3063        }
3064        barrier(CLK_LOCAL_MEM_FENCE);
3065
3066        // Apply vertical hat
3067        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3068          pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3069          float delta = tmp[i / 4] - pixel;
3070          tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3071          if (delta < -thresh)
3072            delta += thresh;
3073          else if (delta > thresh)
3074            delta -= thresh;
3075          else
3076            delta = 0;
3077          accum[i / 4] += delta;
3078        }
3079        barrier(CLK_LOCAL_MEM_FENCE);
3080
3081        if (pass < passes - 1)
3082          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3083            buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3084        else  // last pass
3085          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3086            accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3087        barrier(CLK_LOCAL_MEM_FENCE);
3088      }
3089
3090      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3091        stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3092
3093      barrier(CLK_LOCAL_MEM_FENCE);
3094    }
3095
3096    // Write from stage to output
3097
3098    if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3099      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3100        if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3101          int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3102          for (int channel = 0; channel < max_channels; ++channel) {
3103            dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3104          }
3105        }
3106      }
3107    }
3108  }
3109  )
3110
3111  ;
3112
3113#endif // MAGICKCORE_OPENCL_SUPPORT
3114
3115#if defined(__cplusplus) || defined(c_plusplus)
3116}
3117#endif
3118
3119#endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3120