accelerate-kernels-private.h revision fdf0c636b0dd3c1a32a7667e6a84698713f84a68
1/*
2  Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4
5  You may not use this file except in compliance with the License.
6  obtain a copy of the License at
7
8    http://www.imagemagick.org/script/license.php
9
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15
16  MagickCore private kernels for accelerated functions.
17*/
18
19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21
22#if defined(__cplusplus) || defined(c_plusplus)
23extern "C" {
24#endif
25
26#if defined(MAGICKCORE_OPENCL_SUPPORT)
27
28/*
29  Define declarations.
30*/
31#define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32#define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33#define OPENCL_ELSE()		"\n #""else " " \n"
34#define OPENCL_ENDIF()		"\n #""endif " " \n"
35#define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36#define STRINGIFY(...) #__VA_ARGS__ "\n"
37
38/*
39  Typedef declarations.
40*/
41
42typedef struct _FloatPixelPacket
43{
44  MagickRealType
45    red,
46    green,
47    blue,
48    alpha,
49    black;
50} FloatPixelPacket;
51
52const char *accelerateKernels =
53
54/*
55  Define declarations.
56*/
57  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
58  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
59  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
60  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
61  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
62  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
63  OPENCL_DEFINE(SigmaRandom, (attenuate))
64  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
65  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
66  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
67
68/*
69  Typedef declarations.
70*/
71  STRINGIFY(
72    typedef enum
73  {
74    UndefinedColorspace,
75    RGBColorspace,            /* Linear RGB colorspace */
76    GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
77    TransparentColorspace,
78    OHTAColorspace,
79    LabColorspace,
80    XYZColorspace,
81    YCbCrColorspace,
82    YCCColorspace,
83    YIQColorspace,
84    YPbPrColorspace,
85    YUVColorspace,
86    CMYKColorspace,           /* negared linear RGB with black separated */
87    sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
88    HSBColorspace,
89    HSLColorspace,
90    HWBColorspace,
91    Rec601LumaColorspace,
92    Rec601YCbCrColorspace,
93    Rec709LumaColorspace,
94    Rec709YCbCrColorspace,
95    LogColorspace,
96    CMYColorspace,            /* negated linear RGB colorspace */
97    LuvColorspace,
98    HCLColorspace,
99    LCHColorspace,            /* alias for LCHuv */
100    LMSColorspace,
101    LCHabColorspace,          /* Cylindrical (Polar) Lab */
102    LCHuvColorspace,          /* Cylindrical (Polar) Luv */
103    scRGBColorspace,
104    HSIColorspace,
105    HSVColorspace,            /* alias for HSB */
106    HCLpColorspace,
107    YDbDrColorspace
108  } ColorspaceType;
109  )
110
111  STRINGIFY(
112    typedef enum
113    {
114      UndefinedCompositeOp,
115      NoCompositeOp,
116      ModulusAddCompositeOp,
117      AtopCompositeOp,
118      BlendCompositeOp,
119      BumpmapCompositeOp,
120      ChangeMaskCompositeOp,
121      ClearCompositeOp,
122      ColorBurnCompositeOp,
123      ColorDodgeCompositeOp,
124      ColorizeCompositeOp,
125      CopyBlackCompositeOp,
126      CopyBlueCompositeOp,
127      CopyCompositeOp,
128      CopyCyanCompositeOp,
129      CopyGreenCompositeOp,
130      CopyMagentaCompositeOp,
131      CopyOpacityCompositeOp,
132      CopyRedCompositeOp,
133      CopyYellowCompositeOp,
134      DarkenCompositeOp,
135      DstAtopCompositeOp,
136      DstCompositeOp,
137      DstInCompositeOp,
138      DstOutCompositeOp,
139      DstOverCompositeOp,
140      DifferenceCompositeOp,
141      DisplaceCompositeOp,
142      DissolveCompositeOp,
143      ExclusionCompositeOp,
144      HardLightCompositeOp,
145      HueCompositeOp,
146      InCompositeOp,
147      LightenCompositeOp,
148      LinearLightCompositeOp,
149      LuminizeCompositeOp,
150      MinusDstCompositeOp,
151      ModulateCompositeOp,
152      MultiplyCompositeOp,
153      OutCompositeOp,
154      OverCompositeOp,
155      OverlayCompositeOp,
156      PlusCompositeOp,
157      ReplaceCompositeOp,
158      SaturateCompositeOp,
159      ScreenCompositeOp,
160      SoftLightCompositeOp,
161      SrcAtopCompositeOp,
162      SrcCompositeOp,
163      SrcInCompositeOp,
164      SrcOutCompositeOp,
165      SrcOverCompositeOp,
166      ModulusSubtractCompositeOp,
167      ThresholdCompositeOp,
168      XorCompositeOp,
169      /* These are new operators, added after the above was last sorted.
170       * The list should be re-sorted only when a new library version is
171       * created.
172       */
173      DivideDstCompositeOp,
174      DistortCompositeOp,
175      BlurCompositeOp,
176      PegtopLightCompositeOp,
177      VividLightCompositeOp,
178      PinLightCompositeOp,
179      LinearDodgeCompositeOp,
180      LinearBurnCompositeOp,
181      MathematicsCompositeOp,
182      DivideSrcCompositeOp,
183      MinusSrcCompositeOp,
184      DarkenIntensityCompositeOp,
185      LightenIntensityCompositeOp
186    } CompositeOperator;
187  )
188
189  STRINGIFY(
190     typedef enum
191     {
192       UndefinedFunction,
193       ArcsinFunction,
194       ArctanFunction,
195       PolynomialFunction,
196       SinusoidFunction
197     } MagickFunction;
198  )
199
200  STRINGIFY(
201    typedef enum
202    {
203      UndefinedNoise,
204      UniformNoise,
205      GaussianNoise,
206      MultiplicativeGaussianNoise,
207      ImpulseNoise,
208      LaplacianNoise,
209      PoissonNoise,
210      RandomNoise
211    } NoiseType;
212  )
213
214  STRINGIFY(
215    typedef enum
216  {
217    UndefinedPixelIntensityMethod = 0,
218    AveragePixelIntensityMethod,
219    BrightnessPixelIntensityMethod,
220    LightnessPixelIntensityMethod,
221    Rec601LumaPixelIntensityMethod,
222    Rec601LuminancePixelIntensityMethod,
223    Rec709LumaPixelIntensityMethod,
224    Rec709LuminancePixelIntensityMethod,
225    RMSPixelIntensityMethod,
226    MSPixelIntensityMethod
227  } PixelIntensityMethod;
228  )
229
230  STRINGIFY(
231  typedef enum {
232    BoxWeightingFunction = 0,
233    TriangleWeightingFunction,
234    CubicBCWeightingFunction,
235    HannWeightingFunction,
236    HammingWeightingFunction,
237    BlackmanWeightingFunction,
238    GaussianWeightingFunction,
239    QuadraticWeightingFunction,
240    JincWeightingFunction,
241    SincWeightingFunction,
242    SincFastWeightingFunction,
243    KaiserWeightingFunction,
244    WelshWeightingFunction,
245    BohmanWeightingFunction,
246    LagrangeWeightingFunction,
247    CosineWeightingFunction,
248  } ResizeWeightingFunctionType;
249  )
250
251  STRINGIFY(
252     typedef enum
253     {
254       UndefinedChannel = 0x0000,
255       RedChannel = 0x0001,
256       GrayChannel = 0x0001,
257       CyanChannel = 0x0001,
258       GreenChannel = 0x0002,
259       MagentaChannel = 0x0002,
260       BlueChannel = 0x0004,
261       YellowChannel = 0x0004,
262       BlackChannel = 0x0008,
263       AlphaChannel = 0x0010,
264       OpacityChannel = 0x0010,
265       IndexChannel = 0x0020,             /* Color Index Table? */
266       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
267       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
268       MetaChannel = 0x0100,              /* ???? */
269       CompositeChannels = 0x002F,
270       AllChannels = 0x7ffffff,
271       /*
272         Special purpose channel types.
273         FUTURE: are these needed any more - they are more like hacks
274         SyncChannels for example is NOT a real channel but a 'flag'
275         It really says -- "User has not defined channels"
276         Though it does have extra meaning in the "-auto-level" operator
277       */
278       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
279       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
280       GrayChannels = 0x0400,
281       SyncChannels = 0x20000,    /* channels modified as a single unit */
282       DefaultChannels = AllChannels
283     } ChannelType;  /* must correspond to PixelChannel */
284  )
285
286/*
287  Helper functions.
288*/
289
290OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
291
292  STRINGIFY(
293    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294    {
295      return((CLQuantum) value);
296    }
297  )
298
299OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
300
301  STRINGIFY(
302    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
303    {
304      return((CLQuantum) (257.0f*value));
305    }
306  )
307
308OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
309
310  STRINGIFY(
311    inline CLQuantum ScaleCharToQuantum(const unsigned char value)
312    {
313      return((CLQuantum) (16843009.0*value));
314    }
315  )
316
317OPENCL_ENDIF()
318
319  STRINGIFY(
320    inline int ClampToCanvas(const int offset,const int range)
321      {
322        return clamp(offset, (int)0, range-1);
323      }
324  )
325
326  STRINGIFY(
327    inline CLQuantum ClampToQuantum(const float value)
328      {
329        return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
330      }
331  )
332
333  STRINGIFY(
334    inline uint ScaleQuantumToMap(CLQuantum value)
335      {
336        if (value >= (CLQuantum) MaxMap)
337          return ((uint)MaxMap);
338        else
339          return ((uint)value);
340      }
341  )
342
343  STRINGIFY(
344    inline float PerceptibleReciprocal(const float x)
345    {
346      float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347      return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348    }
349  )
350
351  STRINGIFY(
352  inline float RoundToUnity(const float value)
353   {
354     return clamp(value,0.0f,1.0f);
355   }
356  )
357
358  STRINGIFY(
359
360  inline unsigned int getPixelIndex(const unsigned int number_channels,
361    const unsigned int columns, const unsigned int x, const unsigned int y)
362  {
363    return (x * number_channels) + (y * columns * number_channels);
364  }
365
366  inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
367  inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
368  inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
369  inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
370
371  inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
372  inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
373  inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
374  inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
375
376  inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
377  inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
378  inline float getBlueF4(float4 p)                      { return p.x; }
379  inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
380
381  inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
382  inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
383  inline float getGreenF4(float4 p)                     { return p.y; }
384  inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
385
386  inline CLQuantum getRed(CLPixelType p)                { return p.z; }
387  inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
388  inline float getRedF4(float4 p)                       { return p.z; }
389  inline void setRedF4(float4* p, float value)          { (*p).z = value; }
390
391  inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
392  inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
393  inline float getAlphaF4(float4 p)                     { return p.w; }
394  inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
395
396  inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
397    const ChannelType channel, float *red, float *green, float *blue, float *alpha)
398  {
399    if ((channel & RedChannel) != 0)
400      *red=getPixelRed(p);
401
402    if (number_channels > 2)
403      {
404        if ((channel & GreenChannel) != 0)
405          *green=getPixelGreen(p);
406
407        if ((channel & BlueChannel) != 0)
408          *blue=getPixelBlue(p);
409      }
410
411    if (((number_channels == 4) || (number_channels == 2)) &&
412        ((channel & AlphaChannel) != 0))
413      *alpha=getPixelAlpha(p,number_channels);
414  }
415
416  inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
417    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
418  {
419    const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
420
421    float red = 0.0f;
422    float green = 0.0f;
423    float blue = 0.0f;
424    float alpha = 0.0f;
425
426    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
427    return (float4)(red, green, blue, alpha);
428  }
429
430  inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
431    const ChannelType channel, float red, float green, float blue, float alpha)
432  {
433    if ((channel & RedChannel) != 0)
434      setPixelRed(p,ClampToQuantum(red));
435
436    if (number_channels > 2)
437      {
438        if ((channel & GreenChannel) != 0)
439          setPixelGreen(p,ClampToQuantum(green));
440
441        if ((channel & BlueChannel) != 0)
442          setPixelBlue(p,ClampToQuantum(blue));
443      }
444
445    if (((number_channels == 4) || (number_channels == 2)) &&
446        ((channel & AlphaChannel) != 0))
447      setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
448  }
449
450  inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
451    const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
452    float4 pixel)
453  {
454    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
455    WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
456  }
457
458  inline float GetPixelIntensity(const unsigned int colorspace,
459    const unsigned int method,float red,float green,float blue)
460  {
461    float intensity;
462
463    if (colorspace == GRAYColorspace)
464      return red;
465
466    switch (method)
467    {
468      case AveragePixelIntensityMethod:
469        {
470          intensity=(red+green+blue)/3.0;
471          break;
472        }
473      case BrightnessPixelIntensityMethod:
474        {
475          intensity=MagickMax(MagickMax(red,green),blue);
476          break;
477        }
478      case LightnessPixelIntensityMethod:
479        {
480          intensity=(MagickMin(MagickMin(red,green),blue)+
481              MagickMax(MagickMax(red,green),blue))/2.0;
482          break;
483        }
484      case MSPixelIntensityMethod:
485        {
486          intensity=(float) (((float) red*red+green*green+blue*blue)/
487              (3.0*QuantumRange));
488          break;
489        }
490      case Rec601LumaPixelIntensityMethod:
491        {
492          /*
493          if (image->colorspace == RGBColorspace)
494          {
495            red=EncodePixelGamma(red);
496            green=EncodePixelGamma(green);
497            blue=EncodePixelGamma(blue);
498          }
499          */
500          intensity=0.298839*red+0.586811*green+0.114350*blue;
501          break;
502        }
503      case Rec601LuminancePixelIntensityMethod:
504        {
505          /*
506          if (image->colorspace == sRGBColorspace)
507          {
508            red=DecodePixelGamma(red);
509            green=DecodePixelGamma(green);
510            blue=DecodePixelGamma(blue);
511          }
512          */
513          intensity=0.298839*red+0.586811*green+0.114350*blue;
514          break;
515        }
516      case Rec709LumaPixelIntensityMethod:
517      default:
518        {
519          /*
520          if (image->colorspace == RGBColorspace)
521          {
522            red=EncodePixelGamma(red);
523            green=EncodePixelGamma(green);
524            blue=EncodePixelGamma(blue);
525          }
526          */
527          intensity=0.212656*red+0.715158*green+0.072186*blue;
528          break;
529        }
530      case Rec709LuminancePixelIntensityMethod:
531        {
532          /*
533          if (image->colorspace == sRGBColorspace)
534          {
535            red=DecodePixelGamma(red);
536            green=DecodePixelGamma(green);
537            blue=DecodePixelGamma(blue);
538          }
539          */
540          intensity=0.212656*red+0.715158*green+0.072186*blue;
541          break;
542        }
543      case RMSPixelIntensityMethod:
544        {
545          intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
546              sqrt(3.0));
547          break;
548        }
549    }
550
551    return intensity;
552  }
553
554  inline int mirrorBottom(int value)
555  {
556      return (value < 0) ? - (value) : value;
557  }
558
559  inline int mirrorTop(int value, int width)
560  {
561      return (value >= width) ? (2 * width - value - 1) : value;
562  }
563  )
564
565/*
566%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
567%                                                                             %
568%                                                                             %
569%                                                                             %
570%     A d d N o i s e                                                         %
571%                                                                             %
572%                                                                             %
573%                                                                             %
574%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
575*/
576
577  STRINGIFY(
578  /*
579  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
580  This is provided under BSD, full license is with the main package.
581  See http://www.doc.ic.ac.uk/~dt10/research
582  */
583
584  // Pre: a<M, b<M
585  // Post: r=(a+b) mod M
586  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
587  {
588    ulong v=a+b;
589    //if( (v>=M) || (v<a) )
590    if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
591      v=v-M;
592    return v;
593  }
594
595  // Pre: a<M,b<M
596  // Post: r=(a*b) mod M
597  // This could be done more efficently, but it is portable, and should
598  // be easy to understand. It can be replaced with any of the better
599  // modular multiplication algorithms (for example if you know you have
600  // double precision available or something).
601  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
602  {
603    ulong r=0;
604    while(a!=0){
605      if(a&1)
606        r=MWC_AddMod64(r,b,M);
607      b=MWC_AddMod64(b,b,M);
608      a=a>>1;
609    }
610    return r;
611  }
612
613  // Pre: a<M, e>=0
614  // Post: r=(a^b) mod M
615  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
616  // most architectures
617  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
618  {
619    ulong sqr=a, acc=1;
620    while(e!=0){
621      if(e&1)
622        acc=MWC_MulMod64(acc,sqr,M);
623        sqr=MWC_MulMod64(sqr,sqr,M);
624      e=e>>1;
625    }
626    return acc;
627  }
628
629  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
630  {
631    ulong m=MWC_PowMod64(A, distance, M);
632    ulong x=curr.x*(ulong)A+curr.y;
633    x=MWC_MulMod64(x, m, M);
634    return (uint2)((uint)(x/A), (uint)(x%A));
635  }
636
637  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
638  {
639    // This is an arbitrary constant for starting LCG jumping from. I didn't
640    // want to start from 1, as then you end up with the two or three first values
641    // being a bit poor in ones - once you've decided that, one constant is as
642    // good as any another. There is no deep mathematical reason for it, I just
643    // generated a random number.
644    enum{ MWC_BASEID = 4077358422479273989UL };
645
646    ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
647    ulong m=MWC_PowMod64(A, dist, M);
648
649    ulong x=MWC_MulMod64(MWC_BASEID, m, M);
650    return (uint2)((uint)(x/A), (uint)(x%A));
651  }
652
653  //! Represents the state of a particular generator
654  typedef struct{ uint x; uint c; } mwc64x_state_t;
655
656  enum{ MWC64X_A = 4294883355U };
657  enum{ MWC64X_M = 18446383549859758079UL };
658
659  void MWC64X_Step(mwc64x_state_t *s)
660  {
661    uint X=s->x, C=s->c;
662
663    uint Xn=MWC64X_A*X+C;
664    uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
665    uint Cn=mad_hi(MWC64X_A,X,carry);
666
667    s->x=Xn;
668    s->c=Cn;
669  }
670
671  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
672  {
673    uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
674    s->x=tmp.x;
675    s->c=tmp.y;
676  }
677
678  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
679  {
680    uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
681    s->x=tmp.x;
682    s->c=tmp.y;
683  }
684
685  //! Return a 32-bit integer in the range [0..2^32)
686  uint MWC64X_NextUint(mwc64x_state_t *s)
687  {
688    uint res=s->x ^ s->c;
689    MWC64X_Step(s);
690    return res;
691  }
692
693  //
694  // End of MWC64X excerpt
695  //
696
697  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
698  {
699    return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
700  }
701
702  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
703  {
704    float
705      alpha,
706      beta,
707      noise,
708      sigma;
709
710    noise = 0.0f;
711    alpha=mwcReadPseudoRandomValue(r);
712    switch(noise_type)
713    {
714      case UniformNoise:
715      default:
716        {
717          noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
718          break;
719        }
720      case GaussianNoise:
721        {
722          float
723            gamma,
724            tau;
725
726          if (alpha == 0.0f)
727            alpha=1.0f;
728          beta=mwcReadPseudoRandomValue(r);
729          gamma=sqrt(-2.0f*log(alpha));
730          sigma=gamma*cospi((2.0f*beta));
731          tau=gamma*sinpi((2.0f*beta));
732          noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
733          break;
734        }
735      case ImpulseNoise:
736      {
737        if (alpha < (SigmaImpulse/2.0f))
738          noise=0.0f;
739        else
740          if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
741            noise=QuantumRange;
742          else
743            noise=pixel;
744        break;
745      }
746      case LaplacianNoise:
747      {
748        if (alpha <= 0.5f)
749          {
750            if (alpha <= MagickEpsilon)
751              noise=(pixel-QuantumRange);
752            else
753              noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
754                0.5f);
755            break;
756          }
757        beta=1.0f-alpha;
758        if (beta <= (0.5f*MagickEpsilon))
759          noise=(pixel+QuantumRange);
760        else
761          noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
762        break;
763      }
764      case MultiplicativeGaussianNoise:
765      {
766        sigma=1.0f;
767        if (alpha > MagickEpsilon)
768          sigma=sqrt(-2.0f*log(alpha));
769        beta=mwcReadPseudoRandomValue(r);
770        noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
771          cospi((2.0f*beta))/2.0f);
772        break;
773      }
774      case PoissonNoise:
775      {
776        float
777          poisson;
778        unsigned int i;
779        poisson=exp(-SigmaPoisson*QuantumScale*pixel);
780        for (i=0; alpha > poisson; i++)
781        {
782          beta=mwcReadPseudoRandomValue(r);
783          alpha*=beta;
784        }
785        noise=(QuantumRange*i/SigmaPoisson);
786        break;
787      }
788      case RandomNoise:
789      {
790        noise=(QuantumRange*SigmaRandom*alpha);
791        break;
792      }
793    }
794    return noise;
795  }
796
797  __kernel
798  void AddNoise(const __global CLQuantum *image,
799    const unsigned int number_channels,const ChannelType channel,
800    const unsigned int length,const unsigned int pixelsPerWorkItem,
801    const NoiseType noise_type,const float attenuate,const unsigned int seed0,
802    const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
803    __global CLQuantum *filteredImage)
804  {
805    mwc64x_state_t rng;
806    rng.x = seed0;
807    rng.c = seed1;
808
809    uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
810    uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
811    MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
812
813    uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
814    uint count = pixelsPerWorkItem;
815
816    while (count > 0 && pos < length)
817    {
818      const __global CLQuantum *p = image + pos;
819      __global CLQuantum *q = filteredImage + pos;
820
821      float red;
822      float green;
823      float blue;
824      float alpha;
825
826      ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
827
828      if ((channel & RedChannel) != 0)
829        red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
830
831      if (number_channels > 2)
832      {
833        if ((channel & GreenChannel) != 0)
834          green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
835
836        if ((channel & BlueChannel) != 0)
837          blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
838      }
839
840      if (((number_channels == 4) || (number_channels == 2)) &&
841          ((channel & AlphaChannel) != 0))
842        alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
843
844      WriteChannels(q, number_channels, channel, red, green, blue, alpha);
845
846      pos += (get_local_size(0) * number_channels);
847      count--;
848    }
849  }
850  )
851
852/*
853%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
854%                                                                             %
855%                                                                             %
856%                                                                             %
857%    B l u r                                                                  %
858%                                                                             %
859%                                                                             %
860%                                                                             %
861%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
862*/
863
864  STRINGIFY(
865  /*
866  Reduce image noise and reduce detail levels by row
867  */
868  __kernel void BlurRow(const __global CLQuantum *image,
869    const unsigned int number_channels,const ChannelType channel,
870    __constant float *filter,const unsigned int width,
871    const unsigned int imageColumns,const unsigned int imageRows,
872    __local float4 *temp,__global float4 *tempImage)
873  {
874    const int x = get_global_id(0);
875    const int y = get_global_id(1);
876
877    const int columns = imageColumns;
878
879    const unsigned int radius = (width-1)/2;
880    const int wsize = get_local_size(0);
881    const unsigned int loadSize = wsize+width;
882
883    //group coordinate
884    const int groupX=get_local_size(0)*get_group_id(0);
885
886    //parallel load and clamp
887    for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
888    {
889      int cx = ClampToCanvas(i + groupX - radius, columns);
890      temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
891    }
892
893    // barrier
894    barrier(CLK_LOCAL_MEM_FENCE);
895
896    // only do the work if this is not a patched item
897    if (get_global_id(0) < columns)
898    {
899      // compute
900      float4 result = (float4) 0;
901
902      int i = 0;
903
904      for ( ; i+7 < width; )
905      {
906        for (int j=0; j < 8; j++)
907          result+=filter[i+j]*temp[i+j+get_local_id(0)];
908        i+=8;
909      }
910
911      for ( ; i < width; i++)
912        result+=filter[i]*temp[i+get_local_id(0)];
913
914      // write back to global
915      tempImage[y*columns+x] = result;
916    }
917  }
918  )
919
920  STRINGIFY(
921  /*
922  Reduce image noise and reduce detail levels by line
923  */
924  __kernel void BlurColumn(const __global float4 *blurRowData,
925    const unsigned int number_channels,const ChannelType channel,
926    __constant float *filter,const unsigned int width,
927    const unsigned int imageColumns,const unsigned int imageRows,
928    __local float4 *temp,__global CLQuantum *filteredImage)
929  {
930    const int x = get_global_id(0);
931    const int y = get_global_id(1);
932
933    const int columns = imageColumns;
934    const int rows = imageRows;
935
936    unsigned int radius = (width-1)/2;
937    const int wsize = get_local_size(1);
938    const unsigned int loadSize = wsize+width;
939
940    //group coordinate
941    const int groupX=get_local_size(0)*get_group_id(0);
942    const int groupY=get_local_size(1)*get_group_id(1);
943    //notice that get_local_size(0) is 1, so
944    //groupX=get_group_id(0);
945
946    //parallel load and clamp
947    for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
948      temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
949
950    // barrier
951    barrier(CLK_LOCAL_MEM_FENCE);
952
953    // only do the work if this is not a patched item
954    if (get_global_id(1) < rows)
955    {
956      // compute
957      float4 result = (float4) 0;
958
959      int i = 0;
960
961      for ( ; i+7 < width; )
962      {
963        for (int j=0; j < 8; j++)
964          result+=filter[i+j]*temp[i+j+get_local_id(1)];
965        i+=8;
966      }
967
968      for ( ; i < width; i++)
969        result+=filter[i]*temp[i+get_local_id(1)];
970
971      // write back to global
972      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
973    }
974  }
975  )
976
977/*
978%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
979%                                                                             %
980%                                                                             %
981%                                                                             %
982%    C o n t r a s t                                                          %
983%                                                                             %
984%                                                                             %
985%                                                                             %
986%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
987*/
988
989  STRINGIFY(
990
991  inline float3 ConvertRGBToHSB(CLPixelType pixel) {
992    float3 HueSaturationBrightness;
993    HueSaturationBrightness.x = 0.0f; // Hue
994    HueSaturationBrightness.y = 0.0f; // Saturation
995    HueSaturationBrightness.z = 0.0f; // Brightness
996
997    float r=(float) getRed(pixel);
998    float g=(float) getGreen(pixel);
999    float b=(float) getBlue(pixel);
1000
1001    float tmin=MagickMin(MagickMin(r,g),b);
1002    float tmax=MagickMax(MagickMax(r,g),b);
1003
1004    if (tmax!=0.0f) {
1005      float delta=tmax-tmin;
1006      HueSaturationBrightness.y=delta/tmax;
1007      HueSaturationBrightness.z=QuantumScale*tmax;
1008
1009      if (delta != 0.0f) {
1010  HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1011  HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1012        HueSaturationBrightness.x/=6.0f;
1013        HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1014      }
1015    }
1016    return HueSaturationBrightness;
1017  }
1018
1019  inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1020
1021    float hue = HueSaturationBrightness.x;
1022    float brightness = HueSaturationBrightness.z;
1023    float saturation = HueSaturationBrightness.y;
1024
1025    CLPixelType rgb;
1026
1027    if (saturation == 0.0f) {
1028      setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1029      setGreen(&rgb,getRed(rgb));
1030      setBlue(&rgb,getRed(rgb));
1031    }
1032    else {
1033
1034      float h=6.0f*(hue-floor(hue));
1035      float f=h-floor(h);
1036      float p=brightness*(1.0f-saturation);
1037      float q=brightness*(1.0f-saturation*f);
1038      float t=brightness*(1.0f-(saturation*(1.0f-f)));
1039
1040      float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1041      float clamped_t = ClampToQuantum(QuantumRange*t);
1042      float clamped_p = ClampToQuantum(QuantumRange*p);
1043      float clamped_q = ClampToQuantum(QuantumRange*q);
1044      int ih = (int)h;
1045      setRed(&rgb, (ih == 1)?clamped_q:
1046        (ih == 2 || ih == 3)?clamped_p:
1047        (ih == 4)?clamped_t:
1048                 clampedBrightness);
1049
1050      setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1051        (ih == 3)?clamped_q:
1052        (ih == 4 || ih == 5)?clamped_p:
1053                 clamped_t);
1054
1055      setBlue(&rgb, (ih == 2)?clamped_t:
1056        (ih == 3 || ih == 4)?clampedBrightness:
1057        (ih == 5)?clamped_q:
1058                 clamped_p);
1059    }
1060    return rgb;
1061  }
1062
1063  __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1064  {
1065
1066    const int sign = sharpen!=0?1:-1;
1067    const int x = get_global_id(0);
1068    const int y = get_global_id(1);
1069    const int columns = get_global_size(0);
1070    const int c = x + y * columns;
1071
1072    CLPixelType pixel = im[c];
1073    float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1074    float brightness = HueSaturationBrightness.z;
1075    brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1076    brightness = clamp(brightness,0.0f,1.0f);
1077    HueSaturationBrightness.z = brightness;
1078
1079    CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1080    filteredPixel.w = pixel.w;
1081    im[c] = filteredPixel;
1082  }
1083  )
1084
1085/*
1086%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1087%                                                                             %
1088%                                                                             %
1089%                                                                             %
1090%    C o n t r a s t S t r e t c h                                            %
1091%                                                                             %
1092%                                                                             %
1093%                                                                             %
1094%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1095*/
1096
1097    STRINGIFY(
1098    /*
1099    */
1100    __kernel void Histogram(__global CLPixelType * restrict im,
1101      const ChannelType channel,
1102      const unsigned int colorspace,
1103      const unsigned int method,
1104      __global uint4 * restrict histogram)
1105      {
1106        const int x = get_global_id(0);
1107        const int y = get_global_id(1);
1108        const int columns = get_global_size(0);
1109        const int c = x + y * columns;
1110        if ((channel & SyncChannels) != 0)
1111        {
1112          float red=(float)getRed(im[c]);
1113          float green=(float)getGreen(im[c]);
1114          float blue=(float)getBlue(im[c]);
1115
1116          float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1117          uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1118          atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1119        }
1120        else
1121        {
1122          // for equalizing, we always need all channels?
1123          // otherwise something more
1124        }
1125      }
1126    )
1127
1128    STRINGIFY(
1129    /*
1130    */
1131    __kernel void ContrastStretch(__global CLPixelType * restrict im,
1132      const ChannelType channel,
1133      __global CLPixelType * restrict stretch_map,
1134      const float4 white, const float4 black)
1135      {
1136        const int x = get_global_id(0);
1137        const int y = get_global_id(1);
1138        const int columns = get_global_size(0);
1139        const int c = x + y * columns;
1140
1141        uint ePos;
1142        CLPixelType oValue, eValue;
1143        CLQuantum red, green, blue, alpha;
1144
1145        //read from global
1146        oValue=im[c];
1147
1148        if ((channel & RedChannel) != 0)
1149        {
1150          if (getRedF4(white) != getRedF4(black))
1151          {
1152            ePos = ScaleQuantumToMap(getRed(oValue));
1153            eValue = stretch_map[ePos];
1154            red = getRed(eValue);
1155          }
1156        }
1157
1158        if ((channel & GreenChannel) != 0)
1159        {
1160          if (getGreenF4(white) != getGreenF4(black))
1161          {
1162            ePos = ScaleQuantumToMap(getGreen(oValue));
1163            eValue = stretch_map[ePos];
1164            green = getGreen(eValue);
1165          }
1166        }
1167
1168        if ((channel & BlueChannel) != 0)
1169        {
1170          if (getBlueF4(white) != getBlueF4(black))
1171          {
1172            ePos = ScaleQuantumToMap(getBlue(oValue));
1173            eValue = stretch_map[ePos];
1174            blue = getBlue(eValue);
1175          }
1176        }
1177
1178        if ((channel & AlphaChannel) != 0)
1179        {
1180          if (getAlphaF4(white) != getAlphaF4(black))
1181          {
1182            ePos = ScaleQuantumToMap(getAlpha(oValue));
1183            eValue = stretch_map[ePos];
1184            alpha = getAlpha(eValue);
1185          }
1186        }
1187
1188        //write back
1189        im[c]=(CLPixelType)(blue, green, red, alpha);
1190
1191      }
1192    )
1193
1194/*
1195%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1196%                                                                             %
1197%                                                                             %
1198%                                                                             %
1199%    C o n v o l v e                                                          %
1200%                                                                             %
1201%                                                                             %
1202%                                                                             %
1203%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1204*/
1205
1206  STRINGIFY(
1207    __kernel
1208    void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1209    const unsigned int imageWidth, const unsigned int imageHeight,
1210    __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1211    const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1212
1213      int2 blockID;
1214      blockID.x = get_global_id(0) / get_local_size(0);
1215      blockID.y = get_global_id(1) / get_local_size(1);
1216
1217      // image area processed by this workgroup
1218      int2 imageAreaOrg;
1219      imageAreaOrg.x = blockID.x * get_local_size(0);
1220      imageAreaOrg.y = blockID.y * get_local_size(1);
1221
1222      int2 midFilterDimen;
1223      midFilterDimen.x = (filterWidth-1)/2;
1224      midFilterDimen.y = (filterHeight-1)/2;
1225
1226      int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1227
1228      // dimension of the local cache
1229      int2 cachedAreaDimen;
1230      cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1231      cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1232
1233      // cache the pixels accessed by this workgroup in local memory
1234      int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1235      int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1236      int groupSize = get_local_size(0) * get_local_size(1);
1237      for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1238
1239        int2 cachedAreaIndex;
1240        cachedAreaIndex.x = i % cachedAreaDimen.x;
1241        cachedAreaIndex.y = i / cachedAreaDimen.x;
1242
1243        int2 imagePixelIndex;
1244        imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1245
1246        // only support EdgeVirtualPixelMethod through ClampToCanvas
1247        // TODO: implement other virtual pixel method
1248        imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1249        imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1250
1251        pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1252      }
1253
1254      // cache the filter
1255      for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1256        filterCache[i] = filter[i];
1257      }
1258      barrier(CLK_LOCAL_MEM_FENCE);
1259
1260
1261      int2 imageIndex;
1262      imageIndex.x = imageAreaOrg.x + get_local_id(0);
1263      imageIndex.y = imageAreaOrg.y + get_local_id(1);
1264
1265      // if out-of-range, stops here and quit
1266      if (imageIndex.x >= imageWidth
1267        || imageIndex.y >= imageHeight) {
1268          return;
1269      }
1270
1271      int filterIndex = 0;
1272      float4 sum = (float4)0.0f;
1273      float gamma = 0.0f;
1274      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1275        int cacheIndexY = get_local_id(1);
1276        for (int j = 0; j < filterHeight; j++) {
1277          int cacheIndexX = get_local_id(0);
1278          for (int i = 0; i < filterWidth; i++) {
1279            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1280            float f = filterCache[filterIndex];
1281
1282            sum.x += f * p.x;
1283            sum.y += f * p.y;
1284            sum.z += f * p.z;
1285            sum.w += f * p.w;
1286
1287            gamma += f;
1288            filterIndex++;
1289            cacheIndexX++;
1290          }
1291          cacheIndexY++;
1292        }
1293      }
1294      else {
1295        int cacheIndexY = get_local_id(1);
1296        for (int j = 0; j < filterHeight; j++) {
1297          int cacheIndexX = get_local_id(0);
1298          for (int i = 0; i < filterWidth; i++) {
1299
1300            CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1301            float alpha = QuantumScale*p.w;
1302            float f = filterCache[filterIndex];
1303            float g = alpha * f;
1304
1305            sum.x += g*p.x;
1306            sum.y += g*p.y;
1307            sum.z += g*p.z;
1308            sum.w += f*p.w;
1309
1310            gamma += g;
1311            filterIndex++;
1312            cacheIndexX++;
1313          }
1314          cacheIndexY++;
1315        }
1316        gamma = PerceptibleReciprocal(gamma);
1317        sum.xyz = gamma*sum.xyz;
1318      }
1319      CLPixelType outputPixel;
1320      outputPixel.x = ClampToQuantum(sum.x);
1321      outputPixel.y = ClampToQuantum(sum.y);
1322      outputPixel.z = ClampToQuantum(sum.z);
1323      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1324
1325      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1326    }
1327  )
1328
1329  STRINGIFY(
1330    __kernel
1331    void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1332                  const uint imageWidth, const uint imageHeight,
1333                  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1334                  const uint matte, const ChannelType channel) {
1335
1336      int2 imageIndex;
1337      imageIndex.x = get_global_id(0);
1338      imageIndex.y = get_global_id(1);
1339
1340      /*
1341      unsigned int imageWidth = get_global_size(0);
1342      unsigned int imageHeight = get_global_size(1);
1343      */
1344      if (imageIndex.x >= imageWidth
1345          || imageIndex.y >= imageHeight)
1346          return;
1347
1348      int2 midFilterDimen;
1349      midFilterDimen.x = (filterWidth-1)/2;
1350      midFilterDimen.y = (filterHeight-1)/2;
1351
1352      int filterIndex = 0;
1353      float4 sum = (float4)0.0f;
1354      float gamma = 0.0f;
1355      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1356        for (int j = 0; j < filterHeight; j++) {
1357          int2 inputPixelIndex;
1358          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1359          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1360          for (int i = 0; i < filterWidth; i++) {
1361            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1362            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1363
1364            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1365            float f = filter[filterIndex];
1366
1367            sum.x += f * p.x;
1368            sum.y += f * p.y;
1369            sum.z += f * p.z;
1370            sum.w += f * p.w;
1371
1372            gamma += f;
1373
1374            filterIndex++;
1375          }
1376        }
1377      }
1378      else {
1379
1380        for (int j = 0; j < filterHeight; j++) {
1381          int2 inputPixelIndex;
1382          inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1383          inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1384          for (int i = 0; i < filterWidth; i++) {
1385            inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1386            inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1387
1388            CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1389            float alpha = QuantumScale*p.w;
1390            float f = filter[filterIndex];
1391            float g = alpha * f;
1392
1393            sum.x += g*p.x;
1394            sum.y += g*p.y;
1395            sum.z += g*p.z;
1396            sum.w += f*p.w;
1397
1398            gamma += g;
1399
1400
1401            filterIndex++;
1402          }
1403        }
1404        gamma = PerceptibleReciprocal(gamma);
1405        sum.xyz = gamma*sum.xyz;
1406      }
1407
1408      CLPixelType outputPixel;
1409      outputPixel.x = ClampToQuantum(sum.x);
1410      outputPixel.y = ClampToQuantum(sum.y);
1411      outputPixel.z = ClampToQuantum(sum.z);
1412      outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1413
1414      output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1415    }
1416  )
1417
1418/*
1419%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1420%                                                                             %
1421%                                                                             %
1422%                                                                             %
1423%     D e s p e c k l e                                                       %
1424%                                                                             %
1425%                                                                             %
1426%                                                                             %
1427%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1428*/
1429
1430  STRINGIFY(
1431
1432  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1433  , const unsigned int imageWidth, const unsigned int imageHeight
1434  , const int2 offset, const int polarity, const int matte) {
1435
1436    int x = get_global_id(0);
1437    int y = get_global_id(1);
1438
1439    CLPixelType v = inputImage[y*imageWidth+x];
1440
1441    int2 neighbor;
1442    neighbor.y = y + offset.y;
1443    neighbor.x = x + offset.x;
1444
1445    int2 clampedNeighbor;
1446    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1447    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1448
1449    CLPixelType r = (clampedNeighbor.x == neighbor.x
1450                     && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1451    :(CLPixelType)0;
1452
1453    int sv[4];
1454    sv[0] = (int)v.x;
1455    sv[1] = (int)v.y;
1456    sv[2] = (int)v.z;
1457    sv[3] = (int)v.w;
1458
1459    int sr[4];
1460    sr[0] = (int)r.x;
1461    sr[1] = (int)r.y;
1462    sr[2] = (int)r.z;
1463    sr[3] = (int)r.w;
1464
1465    if (polarity > 0) {
1466      \n #pragma unroll 4\n
1467      for (unsigned int i = 0; i < 4; i++) {
1468        sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1469      }
1470    }
1471    else {
1472      \n #pragma unroll 4\n
1473      for (unsigned int i = 0; i < 4; i++) {
1474        sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1475      }
1476
1477    }
1478
1479    v.x = (CLQuantum)sv[0];
1480    v.y = (CLQuantum)sv[1];
1481    v.z = (CLQuantum)sv[2];
1482
1483    if (matte!=0)
1484      v.w = (CLQuantum)sv[3];
1485
1486    outputImage[y*imageWidth+x] = v;
1487
1488    }
1489
1490
1491  )
1492
1493  STRINGIFY(
1494
1495  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1496  , const unsigned int imageWidth, const unsigned int imageHeight
1497  , const int2 offset, const int polarity, const int matte) {
1498
1499    int x = get_global_id(0);
1500    int y = get_global_id(1);
1501
1502    CLPixelType v = inputImage[y*imageWidth+x];
1503
1504    int2 neighbor, clampedNeighbor;
1505
1506    neighbor.y = y + offset.y;
1507    neighbor.x = x + offset.x;
1508    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1509    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1510
1511    CLPixelType r = (clampedNeighbor.x == neighbor.x
1512      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1513    :(CLPixelType)0;
1514
1515
1516    neighbor.y = y - offset.y;
1517    neighbor.x = x - offset.x;
1518    clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1519    clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1520
1521    CLPixelType s = (clampedNeighbor.x == neighbor.x
1522      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1523    :(CLPixelType)0;
1524
1525
1526    int sv[4];
1527    sv[0] = (int)v.x;
1528    sv[1] = (int)v.y;
1529    sv[2] = (int)v.z;
1530    sv[3] = (int)v.w;
1531
1532    int sr[4];
1533    sr[0] = (int)r.x;
1534    sr[1] = (int)r.y;
1535    sr[2] = (int)r.z;
1536    sr[3] = (int)r.w;
1537
1538    int ss[4];
1539    ss[0] = (int)s.x;
1540    ss[1] = (int)s.y;
1541    ss[2] = (int)s.z;
1542    ss[3] = (int)s.w;
1543
1544    if (polarity > 0) {
1545      \n #pragma unroll 4\n
1546      for (unsigned int i = 0; i < 4; i++) {
1547        //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1548        //
1549        //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1550        //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1551        sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1552      }
1553    }
1554    else {
1555      \n #pragma unroll 4\n
1556      for (unsigned int i = 0; i < 4; i++) {
1557        //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1558        //
1559        //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1560        sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1561      }
1562    }
1563
1564    v.x = (CLQuantum)sv[0];
1565    v.y = (CLQuantum)sv[1];
1566    v.z = (CLQuantum)sv[2];
1567
1568    if (matte!=0)
1569      v.w = (CLQuantum)sv[3];
1570
1571    outputImage[y*imageWidth+x] = v;
1572
1573    }
1574  )
1575
1576/*
1577%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1578%                                                                             %
1579%                                                                             %
1580%                                                                             %
1581%     E q u a l i z e                                                         %
1582%                                                                             %
1583%                                                                             %
1584%                                                                             %
1585%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1586*/
1587
1588    STRINGIFY(
1589    /*
1590    */
1591    __kernel void Equalize(__global CLPixelType * restrict im,
1592      const ChannelType channel,
1593      __global CLPixelType * restrict equalize_map,
1594      const float4 white, const float4 black)
1595      {
1596        const int x = get_global_id(0);
1597        const int y = get_global_id(1);
1598        const int columns = get_global_size(0);
1599        const int c = x + y * columns;
1600
1601        uint ePos;
1602        CLPixelType oValue, eValue;
1603        CLQuantum red, green, blue, alpha;
1604
1605        //read from global
1606        oValue=im[c];
1607
1608        if ((channel & SyncChannels) != 0)
1609        {
1610          if (getRedF4(white) != getRedF4(black))
1611          {
1612            ePos = ScaleQuantumToMap(getRed(oValue));
1613            eValue = equalize_map[ePos];
1614            red = getRed(eValue);
1615            ePos = ScaleQuantumToMap(getGreen(oValue));
1616            eValue = equalize_map[ePos];
1617            green = getRed(eValue);
1618            ePos = ScaleQuantumToMap(getBlue(oValue));
1619            eValue = equalize_map[ePos];
1620            blue = getRed(eValue);
1621            ePos = ScaleQuantumToMap(getAlpha(oValue));
1622            eValue = equalize_map[ePos];
1623            alpha = getRed(eValue);
1624
1625            //write back
1626            im[c]=(CLPixelType)(blue, green, red, alpha);
1627          }
1628
1629        }
1630
1631        // for equalizing, we always need all channels?
1632        // otherwise something more
1633
1634     }
1635    )
1636
1637/*
1638%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1639%                                                                             %
1640%                                                                             %
1641%                                                                             %
1642%     F u n c t i o n                                                         %
1643%                                                                             %
1644%                                                                             %
1645%                                                                             %
1646%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1647*/
1648
1649  STRINGIFY(
1650  /*
1651  apply FunctionImageChannel(braightness-contrast)
1652  */
1653  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1654    const unsigned int number_parameters,__constant float *parameters)
1655  {
1656    float result = 0.0f;
1657
1658    switch (function)
1659    {
1660    case PolynomialFunction:
1661      {
1662        for (unsigned int i=0; i < number_parameters; i++)
1663          result = result*QuantumScale*pixel + parameters[i];
1664        result *= QuantumRange;
1665        break;
1666      }
1667    case SinusoidFunction:
1668      {
1669        float  freq,phase,ampl,bias;
1670        freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1671        phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1672        ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1673        bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1674        result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1675          (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1676        break;
1677      }
1678    case ArcsinFunction:
1679      {
1680        float  width,range,center,bias;
1681        width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1682        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1683        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1684        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1685
1686        result = 2.0f/width*(QuantumScale*pixel - center);
1687        result = range/MagickPI*asin(result)+bias;
1688        result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1689        result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1690        result *= QuantumRange;
1691        break;
1692      }
1693    case ArctanFunction:
1694      {
1695        float slope,range,center,bias;
1696        slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1697        center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1698        range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1699        bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1700        result = MagickPI*slope*(QuantumScale*pixel-center);
1701        result = QuantumRange*(range/MagickPI*atan(result) + bias);
1702        break;
1703      }
1704    case UndefinedFunction:
1705      break;
1706    }
1707    return(ClampToQuantum(result));
1708  }
1709  )
1710
1711  STRINGIFY(
1712  /*
1713  Improve brightness / contrast of the image
1714  channel : define which channel is improved
1715  function : the function called to enchance the brightness contrast
1716  number_parameters : numbers of parameters
1717  parameters : the parameter
1718  */
1719  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1720    const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1721    __constant float *parameters)
1722  {
1723    const unsigned int x = get_global_id(0);
1724    const unsigned int y = get_global_id(1);
1725    const unsigned int columns = get_global_size(0);
1726    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1727
1728    float red;
1729    float green;
1730    float blue;
1731    float alpha;
1732
1733    ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1734
1735    if ((channel & RedChannel) != 0)
1736      red=ApplyFunction(red, function, number_parameters, parameters);
1737
1738    if (number_channels > 2)
1739      {
1740        if ((channel & GreenChannel) != 0)
1741          green=ApplyFunction(green, function, number_parameters, parameters);
1742
1743        if ((channel & BlueChannel) != 0)
1744          blue=ApplyFunction(blue, function, number_parameters, parameters);
1745      }
1746
1747    if (((number_channels == 4) || (number_channels == 2)) &&
1748        ((channel & AlphaChannel) != 0))
1749      alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1750
1751    WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1752  }
1753  )
1754
1755/*
1756%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1757%                                                                             %
1758%                                                                             %
1759%                                                                             %
1760%     G r a y s c a l e                                                       %
1761%                                                                             %
1762%                                                                             %
1763%                                                                             %
1764%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1765*/
1766
1767  STRINGIFY(
1768  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1769    const unsigned int colorspace,const unsigned int method)
1770  {
1771    const unsigned int x = get_global_id(0);
1772    const unsigned int y = get_global_id(1);
1773    const unsigned int columns = get_global_size(0);
1774    __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1775
1776    float
1777      blue,
1778      green,
1779      red;
1780
1781    red=getPixelRed(p);
1782    green=getPixelGreen(p);
1783    blue=getPixelBlue(p);
1784
1785    CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1786
1787    setPixelRed(p,intensity);
1788    setPixelGreen(p,intensity);
1789    setPixelBlue(p,intensity);
1790  }
1791  )
1792
1793/*
1794%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1795%                                                                             %
1796%                                                                             %
1797%                                                                             %
1798%     L o c a l C o n t r a s t                                               %
1799%                                                                             %
1800%                                                                             %
1801%                                                                             %
1802%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1803*/
1804
1805    STRINGIFY(
1806
1807      __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1808          const int radius,
1809          const int imageWidth,
1810          const int imageHeight)
1811      {
1812        const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1813
1814        int x = get_local_id(0);
1815        int y = get_global_id(1);
1816
1817        global CLPixelType *src = srcImage + y * imageWidth;
1818
1819        for (int i = x; i < imageWidth; i += get_local_size(0)) {
1820            float sum = 0.0f;
1821            float weight = 1.0f;
1822
1823            int j = i - radius;
1824            while ((j + 7) < i) {
1825                for (int k = 0; k < 8; ++k) // Unroll 8x
1826                    sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1827                weight += 8.0f;
1828                j+=8;
1829            }
1830            while (j < i) {
1831                sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1832                weight += 1.0f;
1833                ++j;
1834            }
1835
1836            while ((j + 7) < radius + i) {
1837                for (int k = 0; k < 8; ++k) // Unroll 8x
1838                    sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1839                weight -= 8.0f;
1840                j+=8;
1841            }
1842            while (j < radius + i) {
1843                sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1844                weight -= 1.0f;
1845                ++j;
1846            }
1847
1848            tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1849        }
1850      }
1851    )
1852
1853    STRINGIFY(
1854      __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1855          const int radius,
1856          const float strength,
1857          const int imageWidth,
1858          const int imageHeight)
1859      {
1860        const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1861
1862        int x = get_global_id(0);
1863        int y = get_global_id(1);
1864
1865        if ((x >= imageWidth) || (y >= imageHeight))
1866                return;
1867
1868        global float *src = blurImage + x;
1869
1870        float sum = 0.0f;
1871        float weight = 1.0f;
1872
1873        int j = y - radius;
1874        while ((j + 7) < y) {
1875            for (int k = 0; k < 8; ++k) // Unroll 8x
1876                sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1877            weight += 8.0f;
1878            j+=8;
1879        }
1880        while (j < y) {
1881            sum += weight * src[mirrorBottom(j) * imageWidth];
1882            weight += 1.0f;
1883            ++j;
1884        }
1885
1886        while ((j + 7) < radius + y) {
1887            for (int k = 0; k < 8; ++k) // Unroll 8x
1888                sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1889            weight -= 8.0f;
1890            j+=8;
1891        }
1892        while (j < radius + y) {
1893            sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1894            weight -= 1.0f;
1895            ++j;
1896        }
1897
1898        CLPixelType pixel = srcImage[x + y * imageWidth];
1899        float srcVal = dot(RGB, convert_float4(pixel));
1900        float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1901        mult = (srcVal + mult) / srcVal;
1902
1903        pixel.x = ClampToQuantum(pixel.x * mult);
1904        pixel.y = ClampToQuantum(pixel.y * mult);
1905        pixel.z = ClampToQuantum(pixel.z * mult);
1906
1907        dstImage[x + y * imageWidth] = pixel;
1908      }
1909    )
1910
1911/*
1912%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1913%                                                                             %
1914%                                                                             %
1915%                                                                             %
1916%     M o d u l a t e                                                         %
1917%                                                                             %
1918%                                                                             %
1919%                                                                             %
1920%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1921*/
1922
1923  STRINGIFY(
1924
1925  inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1926    float *hue, float *saturation, float *lightness)
1927  {
1928  float
1929    c,
1930    tmax,
1931    tmin;
1932
1933  /*
1934     Convert RGB to HSL colorspace.
1935     */
1936  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1937  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1938
1939  c=tmax-tmin;
1940
1941  *lightness=(tmax+tmin)/2.0;
1942  if (c <= 0.0)
1943  {
1944    *hue=0.0;
1945    *saturation=0.0;
1946    return;
1947  }
1948
1949  if (tmax == (QuantumScale*red))
1950  {
1951    *hue=(QuantumScale*green-QuantumScale*blue)/c;
1952    if ((QuantumScale*green) < (QuantumScale*blue))
1953      *hue+=6.0;
1954  }
1955  else
1956    if (tmax == (QuantumScale*green))
1957      *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1958    else
1959      *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1960
1961  *hue*=60.0/360.0;
1962  if (*lightness <= 0.5)
1963    *saturation=c/(2.0*(*lightness));
1964  else
1965    *saturation=c/(2.0-2.0*(*lightness));
1966  }
1967
1968  inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
1969      CLQuantum *red,CLQuantum *green,CLQuantum *blue)
1970  {
1971    float
1972      b,
1973      c,
1974      g,
1975      h,
1976      tmin,
1977      r,
1978      x;
1979
1980    /*
1981       Convert HSL to RGB colorspace.
1982       */
1983    h=hue*360.0;
1984    if (lightness <= 0.5)
1985      c=2.0*lightness*saturation;
1986    else
1987      c=(2.0-2.0*lightness)*saturation;
1988    tmin=lightness-0.5*c;
1989    h-=360.0*floor(h/360.0);
1990    h/=60.0;
1991    x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
1992    switch ((int) floor(h) % 6)
1993    {
1994      case 0:
1995        {
1996          r=tmin+c;
1997          g=tmin+x;
1998          b=tmin;
1999          break;
2000        }
2001      case 1:
2002        {
2003          r=tmin+x;
2004          g=tmin+c;
2005          b=tmin;
2006          break;
2007        }
2008      case 2:
2009        {
2010          r=tmin;
2011          g=tmin+c;
2012          b=tmin+x;
2013          break;
2014        }
2015      case 3:
2016        {
2017          r=tmin;
2018          g=tmin+x;
2019          b=tmin+c;
2020          break;
2021        }
2022      case 4:
2023        {
2024          r=tmin+x;
2025          g=tmin;
2026          b=tmin+c;
2027          break;
2028        }
2029      case 5:
2030        {
2031          r=tmin+c;
2032          g=tmin;
2033          b=tmin+x;
2034          break;
2035        }
2036      default:
2037        {
2038          r=0.0;
2039          g=0.0;
2040          b=0.0;
2041        }
2042    }
2043    *red=ClampToQuantum(QuantumRange*r);
2044    *green=ClampToQuantum(QuantumRange*g);
2045    *blue=ClampToQuantum(QuantumRange*b);
2046  }
2047
2048  inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2049    CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2050  {
2051    float
2052      hue,
2053      lightness,
2054      saturation;
2055
2056    /*
2057    Increase or decrease color lightness, saturation, or hue.
2058    */
2059    ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2060    hue+=0.5*(0.01*percent_hue-1.0);
2061    while (hue < 0.0)
2062      hue+=1.0;
2063    while (hue >= 1.0)
2064      hue-=1.0;
2065    saturation*=0.01*percent_saturation;
2066    lightness*=0.01*percent_lightness;
2067    ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2068  }
2069
2070  __kernel void Modulate(__global CLPixelType *im,
2071    const float percent_brightness,
2072    const float percent_hue,
2073    const float percent_saturation,
2074    const int colorspace)
2075  {
2076
2077    const int x = get_global_id(0);
2078    const int y = get_global_id(1);
2079    const int columns = get_global_size(0);
2080    const int c = x + y * columns;
2081
2082    CLPixelType pixel = im[c];
2083
2084    CLQuantum
2085        blue,
2086        green,
2087        red;
2088
2089    red=getRed(pixel);
2090    green=getGreen(pixel);
2091    blue=getBlue(pixel);
2092
2093    switch (colorspace)
2094    {
2095      case HSLColorspace:
2096      default:
2097        {
2098          ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2099              &red, &green, &blue);
2100        }
2101
2102    }
2103
2104    CLPixelType filteredPixel;
2105
2106    setRed(&filteredPixel, red);
2107    setGreen(&filteredPixel, green);
2108    setBlue(&filteredPixel, blue);
2109    filteredPixel.w = pixel.w;
2110
2111    im[c] = filteredPixel;
2112  }
2113  )
2114
2115/*
2116%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2117%                                                                             %
2118%                                                                             %
2119%                                                                             %
2120%     M o t i o n B l u r                                                     %
2121%                                                                             %
2122%                                                                             %
2123%                                                                             %
2124%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2125*/
2126
2127  STRINGIFY(
2128    __kernel
2129    void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2130                    const unsigned int imageWidth, const unsigned int imageHeight,
2131                    const __global float *filter, const unsigned int width, const __global int2* offset,
2132                    const float4 bias,
2133                    const ChannelType channel, const unsigned int matte) {
2134
2135      int2 currentPixel;
2136      currentPixel.x = get_global_id(0);
2137      currentPixel.y = get_global_id(1);
2138
2139      if (currentPixel.x >= imageWidth
2140          || currentPixel.y >= imageHeight)
2141          return;
2142
2143      float4 pixel;
2144      pixel.x = (float)bias.x;
2145      pixel.y = (float)bias.y;
2146      pixel.z = (float)bias.z;
2147      pixel.w = (float)bias.w;
2148
2149      if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2150
2151        for (int i = 0; i < width; i++) {
2152          // only support EdgeVirtualPixelMethod through ClampToCanvas
2153          // TODO: implement other virtual pixel method
2154          int2 samplePixel = currentPixel + offset[i];
2155          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2156          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2157          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2158
2159          pixel.x += (filter[i] * (float)samplePixelValue.x);
2160          pixel.y += (filter[i] * (float)samplePixelValue.y);
2161          pixel.z += (filter[i] * (float)samplePixelValue.z);
2162          pixel.w += (filter[i] * (float)samplePixelValue.w);
2163        }
2164
2165        CLPixelType outputPixel;
2166        outputPixel.x = ClampToQuantum(pixel.x);
2167        outputPixel.y = ClampToQuantum(pixel.y);
2168        outputPixel.z = ClampToQuantum(pixel.z);
2169        outputPixel.w = ClampToQuantum(pixel.w);
2170        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2171      }
2172      else {
2173
2174        float gamma = 0.0f;
2175        for (int i = 0; i < width; i++) {
2176          // only support EdgeVirtualPixelMethod through ClampToCanvas
2177          // TODO: implement other virtual pixel method
2178          int2 samplePixel = currentPixel + offset[i];
2179          samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2180          samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2181
2182          CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2183
2184          float alpha = QuantumScale*samplePixelValue.w;
2185          float k = filter[i];
2186          pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2187          pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2188          pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2189
2190          pixel.w += k * alpha * samplePixelValue.w;
2191
2192          gamma+=k*alpha;
2193        }
2194        gamma = PerceptibleReciprocal(gamma);
2195        pixel.xyz = gamma*pixel.xyz;
2196
2197        CLPixelType outputPixel;
2198        outputPixel.x = ClampToQuantum(pixel.x);
2199        outputPixel.y = ClampToQuantum(pixel.y);
2200        outputPixel.z = ClampToQuantum(pixel.z);
2201        outputPixel.w = ClampToQuantum(pixel.w);
2202        output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2203      }
2204    }
2205  )
2206
2207/*
2208%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2209%                                                                             %
2210%                                                                             %
2211%                                                                             %
2212%     R e s i z e                                                             %
2213%                                                                             %
2214%                                                                             %
2215%                                                                             %
2216%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2217*/
2218
2219  STRINGIFY(
2220  // Based on Box from resize.c
2221  float BoxResizeFilter(const float x)
2222  {
2223    return 1.0f;
2224  }
2225  )
2226
2227  STRINGIFY(
2228  // Based on CubicBC from resize.c
2229  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2230  {
2231    /*
2232    Cubic Filters using B,C determined values:
2233    Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2234    Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2235    Spline              B = 1   C = 0    B-Spline Gaussian approximation
2236    Hermite             B = 0   C = 0    B-Spline interpolator
2237
2238    See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2239    Graphics Computer Graphics, Volume 22, Number 4, August 1988
2240    http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2241    Mitchell.pdf.
2242
2243    Coefficents are determined from B,C values:
2244    P0 = (  6 - 2*B       )/6 = coeff[0]
2245    P1 =         0
2246    P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2247    P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2248    Q0 = (      8*B +24*C )/6 = coeff[3]
2249    Q1 = (    -12*B -48*C )/6 = coeff[4]
2250    Q2 = (      6*B +30*C )/6 = coeff[5]
2251    Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2252
2253    which are used to define the filter:
2254
2255    P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2256    Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2257
2258    which ensures function is continuous in value and derivative (slope).
2259    */
2260    if (x < 1.0)
2261      return(resizeFilterCoefficients[0]+x*(x*
2262      (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2263    if (x < 2.0)
2264      return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2265      (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2266    return(0.0);
2267  }
2268  )
2269
2270  STRINGIFY(
2271  float Sinc(const float x)
2272  {
2273    if (x != 0.0f)
2274    {
2275      const float alpha=(float) (MagickPI*x);
2276      return sinpi(x)/alpha;
2277    }
2278    return(1.0f);
2279  }
2280  )
2281
2282  STRINGIFY(
2283  float Triangle(const float x)
2284  {
2285    /*
2286    1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2287    a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2288    for Sinc().
2289    */
2290    return ((x<1.0f)?(1.0f-x):0.0f);
2291  }
2292  )
2293
2294
2295  STRINGIFY(
2296  float Hann(const float x)
2297  {
2298    /*
2299    Cosine window function:
2300      0.5+0.5*cos(pi*x).
2301    */
2302    const float cosine=cos((MagickPI*x));
2303    return(0.5f+0.5f*cosine);
2304  }
2305  )
2306
2307  STRINGIFY(
2308  float Hamming(const float x)
2309  {
2310    /*
2311      Offset cosine window function:
2312       .54 + .46 cos(pi x).
2313    */
2314    const float cosine=cos((MagickPI*x));
2315    return(0.54f+0.46f*cosine);
2316  }
2317  )
2318
2319  STRINGIFY(
2320  float Blackman(const float x)
2321  {
2322    /*
2323      Blackman: 2nd order cosine windowing function:
2324        0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2325
2326      Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2327      five flops.
2328    */
2329    const float cosine=cos((MagickPI*x));
2330    return(0.34f+cosine*(0.5f+cosine*0.16f));
2331  }
2332  )
2333
2334  STRINGIFY(
2335  inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2336  {
2337    switch (filterType)
2338    {
2339    /* Call Sinc even for SincFast to get better precision on GPU
2340       and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2341    case SincWeightingFunction:
2342    case SincFastWeightingFunction:
2343      return Sinc(x);
2344    case CubicBCWeightingFunction:
2345      return CubicBC(x,filterCoefficients);
2346    case BoxWeightingFunction:
2347      return BoxResizeFilter(x);
2348    case TriangleWeightingFunction:
2349      return Triangle(x);
2350    case HannWeightingFunction:
2351      return Hann(x);
2352    case HammingWeightingFunction:
2353      return Hamming(x);
2354    case BlackmanWeightingFunction:
2355      return Blackman(x);
2356
2357    default:
2358      return 0.0f;
2359    }
2360  }
2361  )
2362
2363
2364  STRINGIFY(
2365  inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2366           , const ResizeWeightingFunctionType resizeWindowType
2367           , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2368  {
2369    float scale;
2370    float xBlur = fabs(x/resizeFilterBlur);
2371    if (resizeWindowSupport < MagickEpsilon
2372        || resizeWindowType == BoxWeightingFunction)
2373    {
2374      scale = 1.0f;
2375    }
2376    else
2377    {
2378      scale = resizeFilterScale;
2379      scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2380    }
2381    float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2382    return weight;
2383  }
2384
2385  )
2386
2387  ;
2388  const char *accelerateKernels2 =
2389
2390  STRINGIFY(
2391
2392  inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2393    return (numWorkItems/pixelPerWorkgroup);
2394  }
2395
2396  // returns the index of the pixel for the current workitem to compute.
2397  // returns -1 if this workitem doesn't need to participate in any computation
2398  inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2399    const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2400    int pixelIndex = itemID/numWorkItemsPerPixel;
2401    pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2402    return pixelIndex;
2403  }
2404
2405  )
2406
2407  STRINGIFY(
2408  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2409    void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2410      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2411      const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2412      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2413      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2414      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2415      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2416      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2417  {
2418    // calculate the range of resized image pixels computed by this workgroup
2419    const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2420    const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2421    const unsigned int actualNumPixelToCompute = stopX - startX;
2422
2423    // calculate the range of input image pixels to cache
2424    float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2425    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2426    scale = PerceptibleReciprocal(scale);
2427
2428    const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2429    const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2430
2431    // cache the input pixels into local memory
2432    const unsigned int y = get_global_id(1);
2433    const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2434    const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2435    event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2436    wait_group_events(1, &e);
2437
2438    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2439    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2440    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2441    {
2442      const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2443      const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2444      const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2445
2446      // determine which resized pixel computed by this workitem
2447      const unsigned int itemID = get_local_id(0);
2448      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2449
2450      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2451
2452      float4 filteredPixel = (float4)0.0f;
2453      float density = 0.0f;
2454      float gamma = 0.0f;
2455      // -1 means this workitem doesn't participate in the computation
2456      if (pixelIndex != -1)
2457      {
2458        // x coordinated of the resized pixel computed by this workitem
2459        const int x = chunkStartX + pixelIndex;
2460
2461        // calculate how many steps required for this pixel
2462        const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2463        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2464        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2465        const unsigned int n = stop - start;
2466
2467        // calculate how many steps this workitem will contribute
2468        unsigned int numStepsPerWorkItem = n / numItems;
2469        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2470
2471        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2472        if (startStep < n)
2473        {
2474          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2475
2476          unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2477
2478          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2479          {
2480            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2481              (ResizeWeightingFunctionType) resizeFilterType,
2482              (ResizeWeightingFunctionType) resizeWindowType,
2483              resizeFilterScale, resizeFilterWindowSupport,
2484              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2485
2486            float4 cp = (float4) 0;
2487
2488            __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2489            cp.x = (float) *(p);
2490            if (number_channels > 2)
2491            {
2492              cp.y = (float) *(p + 1);
2493              cp.z = (float) *(p + 2);
2494            }
2495
2496            if (alpha_index != 0)
2497            {
2498              cp.w = (float) *(p + alpha_index);
2499
2500              float alpha = weight * QuantumScale * cp.w;
2501
2502              filteredPixel.x += alpha * cp.x;
2503              filteredPixel.y += alpha * cp.y;
2504              filteredPixel.z += alpha * cp.z;
2505              filteredPixel.w += weight * cp.w;
2506              gamma += alpha;
2507            }
2508            else
2509              filteredPixel += ((float4) weight)*cp;
2510
2511            density += weight;
2512          }
2513        }
2514      }
2515
2516      // initialize the accumulators to zero
2517      if (itemID < actualNumPixelInThisChunk) {
2518        outputPixelCache[itemID] = (float4)0.0f;
2519        densityCache[itemID] = 0.0f;
2520        if (alpha_index != 0)
2521          gammaCache[itemID] = 0.0f;
2522      }
2523      barrier(CLK_LOCAL_MEM_FENCE);
2524
2525      // accumulatte the filtered pixel value and the density
2526      for (unsigned int i = 0; i < numItems; i++) {
2527        if (pixelIndex != -1) {
2528          if (itemID%numItems == i) {
2529            outputPixelCache[pixelIndex]+=filteredPixel;
2530            densityCache[pixelIndex]+=density;
2531            if (alpha_index != 0)
2532              gammaCache[pixelIndex]+=gamma;
2533          }
2534        }
2535        barrier(CLK_LOCAL_MEM_FENCE);
2536      }
2537
2538      if (itemID < actualNumPixelInThisChunk)
2539      {
2540        float4 filteredPixel = outputPixelCache[itemID];
2541
2542        float gamma = 0.0f;
2543        if (alpha_index != 0)
2544          gamma = gammaCache[itemID];
2545
2546        float density = densityCache[itemID];
2547        if ((density != 0.0f) && (density != 1.0f))
2548        {
2549          density = PerceptibleReciprocal(density);
2550          filteredPixel *= (float4) density;
2551          gamma *= density;
2552        }
2553
2554        if (alpha_index != 0)
2555        {
2556          gamma = PerceptibleReciprocal(gamma);
2557          filteredPixel.x *= gamma;
2558          filteredPixel.y *= gamma;
2559          filteredPixel.z *= gamma;
2560        }
2561
2562        WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel);
2563      }
2564    }
2565  }
2566  )
2567
2568
2569  STRINGIFY(
2570 __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2571    void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2572      const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2573      const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2574      const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2575      const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2576      const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2577      const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2578      __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2579  {
2580    // calculate the range of resized image pixels computed by this workgroup
2581    const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2582    const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2583    const unsigned int actualNumPixelToCompute = stopY - startY;
2584
2585    // calculate the range of input image pixels to cache
2586    float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2587    const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2588    scale = PerceptibleReciprocal(scale);
2589
2590    const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2591    const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2592
2593    // cache the input pixels into local memory
2594    const unsigned int x = get_global_id(0);
2595    unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2596    unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2597    unsigned int stride = inputColumns * number_channels;
2598    for (unsigned int i = 0; i < number_channels; i++)
2599    {
2600      event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2601      wait_group_events(1,&e);
2602    }
2603
2604    unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2605    unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2606    for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2607    {
2608      const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2609      const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2610      const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2611
2612      // determine which resized pixel computed by this workitem
2613      const unsigned int itemID = get_local_id(1);
2614      const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2615
2616      const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2617
2618      float4 filteredPixel = (float4)0.0f;
2619      float density = 0.0f;
2620      float gamma = 0.0f;
2621      // -1 means this workitem doesn't participate in the computation
2622      if (pixelIndex != -1)
2623      {
2624        // x coordinated of the resized pixel computed by this workitem
2625        const int y = chunkStartY + pixelIndex;
2626
2627        // calculate how many steps required for this pixel
2628        const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2629        const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2630        const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2631        const unsigned int n = stop - start;
2632
2633        // calculate how many steps this workitem will contribute
2634        unsigned int numStepsPerWorkItem = n / numItems;
2635        numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2636
2637        const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2638        if (startStep < n)
2639        {
2640          const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2641
2642          unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2643          for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2644          {
2645            float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2646              (ResizeWeightingFunctionType) resizeFilterType,
2647              (ResizeWeightingFunctionType) resizeWindowType,
2648              resizeFilterScale, resizeFilterWindowSupport,
2649              resizeFilterBlur, scale*(start + i - bisect + 0.5));
2650
2651            float4 cp = (float4)0.0f;
2652
2653            __local CLQuantum *p = inputImageCache + cacheIndex;
2654            cp.x = (float) *(p);
2655            if (number_channels > 2)
2656            {
2657              cp.y = (float) *(p + rangeLength);
2658              cp.z = (float) *(p + (rangeLength * 2));
2659            }
2660
2661            if (alpha_index != 0)
2662            {
2663              cp.w = (float) *(p + (rangeLength * alpha_index));
2664
2665              float alpha = weight * QuantumScale * cp.w;
2666
2667              filteredPixel.x += alpha * cp.x;
2668              filteredPixel.y += alpha * cp.y;
2669              filteredPixel.z += alpha * cp.z;
2670              filteredPixel.w += weight * cp.w;
2671              gamma += alpha;
2672            }
2673            else
2674              filteredPixel += ((float4) weight)*cp;
2675
2676            density += weight;
2677          }
2678        }
2679      }
2680
2681      // initialize the accumulators to zero
2682      if (itemID < actualNumPixelInThisChunk) {
2683        outputPixelCache[itemID] = (float4)0.0f;
2684        densityCache[itemID] = 0.0f;
2685        if (alpha_index != 0)
2686          gammaCache[itemID] = 0.0f;
2687      }
2688      barrier(CLK_LOCAL_MEM_FENCE);
2689
2690      // accumulatte the filtered pixel value and the density
2691      for (unsigned int i = 0; i < numItems; i++) {
2692        if (pixelIndex != -1) {
2693          if (itemID%numItems == i) {
2694            outputPixelCache[pixelIndex]+=filteredPixel;
2695            densityCache[pixelIndex]+=density;
2696            if (alpha_index != 0)
2697              gammaCache[pixelIndex]+=gamma;
2698          }
2699        }
2700        barrier(CLK_LOCAL_MEM_FENCE);
2701      }
2702
2703      if (itemID < actualNumPixelInThisChunk)
2704      {
2705        float4 filteredPixel = outputPixelCache[itemID];
2706
2707        float gamma = 0.0f;
2708        if (alpha_index != 0)
2709          gamma = gammaCache[itemID];
2710
2711        float density = densityCache[itemID];
2712        if ((density != 0.0f) && (density != 1.0f))
2713        {
2714          density = PerceptibleReciprocal(density);
2715          filteredPixel *= (float4) density;
2716          gamma *= density;
2717        }
2718
2719        if (alpha_index != 0)
2720        {
2721          gamma = PerceptibleReciprocal(gamma);
2722          filteredPixel.x *= gamma;
2723          filteredPixel.y *= gamma;
2724          filteredPixel.z *= gamma;
2725        }
2726
2727        WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel);
2728      }
2729    }
2730  }
2731  )
2732
2733/*
2734%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2735%                                                                             %
2736%                                                                             %
2737%                                                                             %
2738%     R o t a t i o n a l B l u r                                             %
2739%                                                                             %
2740%                                                                             %
2741%                                                                             %
2742%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2743*/
2744
2745  STRINGIFY(
2746  __kernel void RotationalBlur(const __global CLQuantum *image,
2747    const unsigned int number_channels,const unsigned int channel,
2748    const float2 blurCenter,__constant float *cos_theta,
2749    __constant float *sin_theta,const unsigned int cossin_theta_size,
2750    __global CLQuantum *filteredImage)
2751  {
2752    const int x = get_global_id(0);
2753    const int y = get_global_id(1);
2754    const int columns = get_global_size(0);
2755    const int rows = get_global_size(1);
2756    unsigned int step = 1;
2757    float center_x = (float) x - blurCenter.x;
2758    float center_y = (float) y - blurCenter.y;
2759    float radius = hypot(center_x, center_y);
2760
2761    float blur_radius = hypot(blurCenter.x, blurCenter.y);
2762
2763    if (radius > MagickEpsilon)
2764    {
2765      step = (unsigned int) (blur_radius / radius);
2766      if (step == 0)
2767        step = 1;
2768      if (step >= cossin_theta_size)
2769        step = cossin_theta_size-1;
2770    }
2771
2772    float4 result = 0.0f;
2773    float normalize = 0.0f;
2774    float gamma = 0.0f;
2775
2776    for (unsigned int i=0; i<cossin_theta_size; i+=step)
2777    {
2778      int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2779      int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2780
2781      float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, AllChannels);
2782
2783      if ((number_channels == 4) || (number_channels == 2))
2784      {
2785        float alpha = (float)(QuantumScale*pixel.w);
2786
2787        gamma += alpha;
2788
2789        result.x += alpha * pixel.x;
2790        result.y += alpha * pixel.y;
2791        result.z += alpha * pixel.z;
2792        result.w += pixel.w;
2793      }
2794      else
2795        result += pixel;
2796
2797      normalize += 1.0f;
2798    }
2799
2800    normalize = PerceptibleReciprocal(normalize);
2801
2802    if ((number_channels == 4) || (number_channels == 2))
2803    {
2804      gamma = PerceptibleReciprocal(gamma);
2805      result.x *= gamma;
2806      result.y *= gamma;
2807      result.z *= gamma;
2808      result.w *= normalize;
2809    }
2810    else
2811      result *= normalize;
2812
2813    WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2814  }
2815  )
2816
2817/*
2818%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2819%                                                                             %
2820%                                                                             %
2821%                                                                             %
2822%     U n s h a r p M a s k                                                   %
2823%                                                                             %
2824%                                                                             %
2825%                                                                             %
2826%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2827*/
2828
2829  STRINGIFY(
2830  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2831    const __global float4 *blurRowData,const unsigned int number_channels,
2832    const ChannelType channel,const unsigned int columns,
2833    const unsigned int rows,__local float4* cachedData,
2834    __local float* cachedFilter,const __global float *filter,
2835    const unsigned int width,const float gain, const float threshold,
2836    __global CLQuantum *filteredImage)
2837  {
2838    const unsigned int radius = (width-1)/2;
2839
2840    // cache the pixel shared by the workgroup
2841    const int groupX = get_group_id(0);
2842    const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2843    const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2844
2845    if ((groupStartY >= 0) && (groupStopY < rows))
2846    {
2847      event_t e = async_work_group_strided_copy(cachedData,
2848        blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2849      wait_group_events(1,&e);
2850    }
2851    else
2852    {
2853      for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2854        cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2855
2856      barrier(CLK_LOCAL_MEM_FENCE);
2857    }
2858    // cache the filter as well
2859    event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2860    wait_group_events(1,&e);
2861
2862    // only do the work if this is not a patched item
2863    const int cy = get_global_id(1);
2864
2865    if (cy < rows)
2866    {
2867      float4 blurredPixel = (float4) 0.0f;
2868
2869      int i = 0;
2870
2871      for ( ; i+7 < width; )
2872      {
2873        for (int j=0; j < 8; j++, i++)
2874          blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2875      }
2876
2877      for ( ; i < width; i++)
2878        blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2879
2880      float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2881      float4 outputPixel = inputImagePixel - blurredPixel;
2882
2883      float quantumThreshold = QuantumRange*threshold;
2884
2885      int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2886      outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2887
2888      //write back
2889      WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2890    }
2891  }
2892  )
2893
2894  STRINGIFY(
2895  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2896    const ChannelType channel,__constant float *filter,const unsigned int width,
2897    const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2898    const float gain,const float threshold,__global CLQuantum *filteredImage)
2899  {
2900    const unsigned int x = get_global_id(0);
2901    const unsigned int y = get_global_id(1);
2902
2903    const unsigned int radius = (width - 1) / 2;
2904
2905    int row = y - radius;
2906    int baseRow = get_group_id(1) * get_local_size(1) - radius;
2907    int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2908
2909    while (row < endRow) {
2910      int srcy = (row < 0) ? -row : row; // mirror pad
2911      srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2912
2913      float4 value = 0.0f;
2914
2915      int ix = x - radius;
2916      int i = 0;
2917
2918      while (i + 7 < width) {
2919        for (int j = 0; j < 8; ++j) { // unrolled
2920          int srcx = ix + j;
2921          srcx = (srcx < 0) ? -srcx : srcx;
2922          srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2923          value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2924        }
2925        ix += 8;
2926        i += 8;
2927      }
2928
2929      while (i < width) {
2930        int srcx = (ix < 0) ? -ix : ix; // mirror pad
2931        srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2932        value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2933        ++i;
2934        ++ix;
2935      }
2936      pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2937      row += get_local_size(1);
2938    }
2939
2940    barrier(CLK_LOCAL_MEM_FENCE);
2941
2942    const int px = get_local_id(0);
2943    const int py = get_local_id(1);
2944    const int prp = get_local_size(0);
2945    float4 value = (float4)(0.0f);
2946
2947    int i = 0;
2948    while (i + 7 < width) {
2949      for (int j = 0; j < 8; ++j) // unrolled
2950        value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2951      i += 8;
2952    }
2953    while (i < width) {
2954      value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2955      ++i;
2956    }
2957
2958    float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2959    float4 diff = srcPixel - value;
2960
2961    float quantumThreshold = QuantumRange*threshold;
2962
2963    int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
2964    value = select(srcPixel + diff * gain, srcPixel, mask);
2965
2966    if ((x < columns) && (y < rows))
2967      WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
2968  }
2969  )
2970
2971  STRINGIFY(
2972    __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
2973    void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
2974      const unsigned int number_channels,const unsigned int max_channels,
2975      const float threshold,const int passes,const unsigned int imageWidth,
2976      const unsigned int imageHeight)
2977  {
2978    const int pad = (1 << (passes - 1));
2979    const int tileSize = 64;
2980    const int tileRowPixels = 64;
2981    const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
2982
2983    CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
2984
2985    local float buffer[64 * 64];
2986
2987    int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
2988    int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
2989
2990    for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
2991      int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
2992                (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
2993
2994      for (int channel = 0; channel < max_channels; ++channel)
2995        stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
2996    }
2997
2998    for (int channel = 0; channel < max_channels; ++channel) {
2999      // Load LDS
3000      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3001        buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3002
3003      // Process
3004
3005      float tmp[16];
3006      float accum[16];
3007      float pixel;
3008
3009      for (int i = 0; i < 16; i++)
3010        accum[i]=0.0f;
3011
3012      for (int pass = 0; pass < passes; ++pass) {
3013        const int radius = 1 << pass;
3014        const int x = get_local_id(0);
3015        const float thresh = threshold * noise[pass];
3016
3017        // Apply horizontal hat
3018        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3019          const int offset = i * tileRowPixels;
3020          if (pass == 0)
3021            tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3022          pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3023          barrier(CLK_LOCAL_MEM_FENCE);
3024          buffer[x + offset] = pixel;
3025        }
3026        barrier(CLK_LOCAL_MEM_FENCE);
3027
3028        // Apply vertical hat
3029        for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3030          pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3031          float delta = tmp[i / 4] - pixel;
3032          tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3033          if (delta < -thresh)
3034            delta += thresh;
3035          else if (delta > thresh)
3036            delta -= thresh;
3037          else
3038            delta = 0;
3039          accum[i / 4] += delta;
3040        }
3041        barrier(CLK_LOCAL_MEM_FENCE);
3042
3043        if (pass < passes - 1)
3044          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3045            buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3046        else  // last pass
3047          for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3048            accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3049        barrier(CLK_LOCAL_MEM_FENCE);
3050      }
3051
3052      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3053        stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3054
3055      barrier(CLK_LOCAL_MEM_FENCE);
3056    }
3057
3058    // Write from stage to output
3059
3060    if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3061      for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3062        if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3063          int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3064          for (int channel = 0; channel < max_channels; ++channel) {
3065            dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3066          }
3067        }
3068      }
3069    }
3070  }
3071  )
3072
3073  ;
3074
3075#endif // MAGICKCORE_OPENCL_SUPPORT
3076
3077#if defined(__cplusplus) || defined(c_plusplus)
3078}
3079#endif
3080
3081#endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3082