accelerate-kernels-private.h revision 44fb54c2b1dfacf5cb2c9b20b08c164d84f35ecd
1/* 2 Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization 3 dedicated to making software imaging solutions freely available. 4 5 You may not use this file except in compliance with the License. 6 obtain a copy of the License at 7 8 http://www.imagemagick.org/script/license.php 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 16 MagickCore private kernels for accelerated functions. 17*/ 18 19#ifndef _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 20#define _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 21 22#if defined(__cplusplus) || defined(c_plusplus) 23extern "C" { 24#endif 25 26#if defined(MAGICKCORE_OPENCL_SUPPORT) 27 28/* 29 Define declarations. 30*/ 31#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" 32#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" 33#define OPENCL_ELSE() "\n #""else " " \n" 34#define OPENCL_ENDIF() "\n #""endif " " \n" 35#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" 36#define STRINGIFY(...) #__VA_ARGS__ "\n" 37 38/* 39 Typedef declarations. 40*/ 41 42typedef struct _FloatPixelPacket 43{ 44 MagickRealType 45 red, 46 green, 47 blue, 48 alpha, 49 black; 50} FloatPixelPacket; 51 52const char *accelerateKernels = 53 54/* 55 Define declarations. 56*/ 57 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) 58 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) 59 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) 60 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) 61 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) 62 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) 63 OPENCL_DEFINE(SigmaRandom, (attenuate)) 64 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) 65 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) 66 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) 67 68/* 69 Typedef declarations. 70*/ 71 STRINGIFY( 72 typedef enum 73 { 74 UndefinedColorspace, 75 RGBColorspace, /* Linear RGB colorspace */ 76 GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ 77 TransparentColorspace, 78 OHTAColorspace, 79 LabColorspace, 80 XYZColorspace, 81 YCbCrColorspace, 82 YCCColorspace, 83 YIQColorspace, 84 YPbPrColorspace, 85 YUVColorspace, 86 CMYKColorspace, /* negared linear RGB with black separated */ 87 sRGBColorspace, /* Default: non-lienar sRGB colorspace */ 88 HSBColorspace, 89 HSLColorspace, 90 HWBColorspace, 91 Rec601LumaColorspace, 92 Rec601YCbCrColorspace, 93 Rec709LumaColorspace, 94 Rec709YCbCrColorspace, 95 LogColorspace, 96 CMYColorspace, /* negated linear RGB colorspace */ 97 LuvColorspace, 98 HCLColorspace, 99 LCHColorspace, /* alias for LCHuv */ 100 LMSColorspace, 101 LCHabColorspace, /* Cylindrical (Polar) Lab */ 102 LCHuvColorspace, /* Cylindrical (Polar) Luv */ 103 scRGBColorspace, 104 HSIColorspace, 105 HSVColorspace, /* alias for HSB */ 106 HCLpColorspace, 107 YDbDrColorspace 108 } ColorspaceType; 109 ) 110 111 STRINGIFY( 112 typedef enum 113 { 114 UndefinedCompositeOp, 115 NoCompositeOp, 116 ModulusAddCompositeOp, 117 AtopCompositeOp, 118 BlendCompositeOp, 119 BumpmapCompositeOp, 120 ChangeMaskCompositeOp, 121 ClearCompositeOp, 122 ColorBurnCompositeOp, 123 ColorDodgeCompositeOp, 124 ColorizeCompositeOp, 125 CopyBlackCompositeOp, 126 CopyBlueCompositeOp, 127 CopyCompositeOp, 128 CopyCyanCompositeOp, 129 CopyGreenCompositeOp, 130 CopyMagentaCompositeOp, 131 CopyOpacityCompositeOp, 132 CopyRedCompositeOp, 133 CopyYellowCompositeOp, 134 DarkenCompositeOp, 135 DstAtopCompositeOp, 136 DstCompositeOp, 137 DstInCompositeOp, 138 DstOutCompositeOp, 139 DstOverCompositeOp, 140 DifferenceCompositeOp, 141 DisplaceCompositeOp, 142 DissolveCompositeOp, 143 ExclusionCompositeOp, 144 HardLightCompositeOp, 145 HueCompositeOp, 146 InCompositeOp, 147 LightenCompositeOp, 148 LinearLightCompositeOp, 149 LuminizeCompositeOp, 150 MinusDstCompositeOp, 151 ModulateCompositeOp, 152 MultiplyCompositeOp, 153 OutCompositeOp, 154 OverCompositeOp, 155 OverlayCompositeOp, 156 PlusCompositeOp, 157 ReplaceCompositeOp, 158 SaturateCompositeOp, 159 ScreenCompositeOp, 160 SoftLightCompositeOp, 161 SrcAtopCompositeOp, 162 SrcCompositeOp, 163 SrcInCompositeOp, 164 SrcOutCompositeOp, 165 SrcOverCompositeOp, 166 ModulusSubtractCompositeOp, 167 ThresholdCompositeOp, 168 XorCompositeOp, 169 /* These are new operators, added after the above was last sorted. 170 * The list should be re-sorted only when a new library version is 171 * created. 172 */ 173 DivideDstCompositeOp, 174 DistortCompositeOp, 175 BlurCompositeOp, 176 PegtopLightCompositeOp, 177 VividLightCompositeOp, 178 PinLightCompositeOp, 179 LinearDodgeCompositeOp, 180 LinearBurnCompositeOp, 181 MathematicsCompositeOp, 182 DivideSrcCompositeOp, 183 MinusSrcCompositeOp, 184 DarkenIntensityCompositeOp, 185 LightenIntensityCompositeOp 186 } CompositeOperator; 187 ) 188 189 STRINGIFY( 190 typedef enum 191 { 192 UndefinedFunction, 193 ArcsinFunction, 194 ArctanFunction, 195 PolynomialFunction, 196 SinusoidFunction 197 } MagickFunction; 198 ) 199 200 STRINGIFY( 201 typedef enum 202 { 203 UndefinedNoise, 204 UniformNoise, 205 GaussianNoise, 206 MultiplicativeGaussianNoise, 207 ImpulseNoise, 208 LaplacianNoise, 209 PoissonNoise, 210 RandomNoise 211 } NoiseType; 212 ) 213 214 STRINGIFY( 215 typedef enum 216 { 217 UndefinedPixelIntensityMethod = 0, 218 AveragePixelIntensityMethod, 219 BrightnessPixelIntensityMethod, 220 LightnessPixelIntensityMethod, 221 Rec601LumaPixelIntensityMethod, 222 Rec601LuminancePixelIntensityMethod, 223 Rec709LumaPixelIntensityMethod, 224 Rec709LuminancePixelIntensityMethod, 225 RMSPixelIntensityMethod, 226 MSPixelIntensityMethod 227 } PixelIntensityMethod; 228 ) 229 230 STRINGIFY( 231 typedef enum { 232 BoxWeightingFunction = 0, 233 TriangleWeightingFunction, 234 CubicBCWeightingFunction, 235 HannWeightingFunction, 236 HammingWeightingFunction, 237 BlackmanWeightingFunction, 238 GaussianWeightingFunction, 239 QuadraticWeightingFunction, 240 JincWeightingFunction, 241 SincWeightingFunction, 242 SincFastWeightingFunction, 243 KaiserWeightingFunction, 244 WelshWeightingFunction, 245 BohmanWeightingFunction, 246 LagrangeWeightingFunction, 247 CosineWeightingFunction, 248 } ResizeWeightingFunctionType; 249 ) 250 251 STRINGIFY( 252 typedef enum 253 { 254 UndefinedChannel = 0x0000, 255 RedChannel = 0x0001, 256 GrayChannel = 0x0001, 257 CyanChannel = 0x0001, 258 GreenChannel = 0x0002, 259 MagentaChannel = 0x0002, 260 BlueChannel = 0x0004, 261 YellowChannel = 0x0004, 262 BlackChannel = 0x0008, 263 AlphaChannel = 0x0010, 264 OpacityChannel = 0x0010, 265 IndexChannel = 0x0020, /* Color Index Table? */ 266 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ 267 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ 268 MetaChannel = 0x0100, /* ???? */ 269 CompositeChannels = 0x002F, 270 AllChannels = 0x7ffffff, 271 /* 272 Special purpose channel types. 273 FUTURE: are these needed any more - they are more like hacks 274 SyncChannels for example is NOT a real channel but a 'flag' 275 It really says -- "User has not defined channels" 276 Though it does have extra meaning in the "-auto-level" operator 277 */ 278 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ 279 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ 280 GrayChannels = 0x0400, 281 SyncChannels = 0x20000, /* channels modified as a single unit */ 282 DefaultChannels = AllChannels 283 } ChannelType; /* must correspond to PixelChannel */ 284 ) 285 286/* 287 Helper functions. 288*/ 289 290OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) 291 292 STRINGIFY( 293 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 294 { 295 return((CLQuantum) value); 296 } 297 ) 298 299OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) 300 301 STRINGIFY( 302 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 303 { 304 return((CLQuantum) (257.0f*value)); 305 } 306 ) 307 308OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) 309 310 STRINGIFY( 311 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 312 { 313 return((CLQuantum) (16843009.0*value)); 314 } 315 ) 316 317OPENCL_ENDIF() 318 319 STRINGIFY( 320 inline int ClampToCanvas(const int offset,const int range) 321 { 322 return clamp(offset, (int)0, range-1); 323 } 324 ) 325 326 STRINGIFY( 327 inline CLQuantum ClampToQuantum(const float value) 328 { 329 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); 330 } 331 ) 332 333 STRINGIFY( 334 inline uint ScaleQuantumToMap(CLQuantum value) 335 { 336 if (value >= (CLQuantum) MaxMap) 337 return ((uint)MaxMap); 338 else 339 return ((uint)value); 340 } 341 ) 342 343 STRINGIFY( 344 inline float PerceptibleReciprocal(const float x) 345 { 346 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; 347 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); 348 } 349 ) 350 351 STRINGIFY( 352 inline float RoundToUnity(const float value) 353 { 354 return clamp(value,0.0f,1.0f); 355 } 356 ) 357 358 STRINGIFY( 359 360 inline unsigned int getPixelIndex(const unsigned int number_channels, 361 const unsigned int columns, const unsigned int x, const unsigned int y) 362 { 363 return (x * number_channels) + (y * columns * number_channels); 364 } 365 366 inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } 367 inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } 368 inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } 369 inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } 370 371 inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } 372 inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } 373 inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } 374 inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } 375 376 inline CLQuantum getBlue(CLPixelType p) { return p.x; } 377 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } 378 inline float getBlueF4(float4 p) { return p.x; } 379 inline void setBlueF4(float4* p, float value) { (*p).x = value; } 380 381 inline CLQuantum getGreen(CLPixelType p) { return p.y; } 382 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } 383 inline float getGreenF4(float4 p) { return p.y; } 384 inline void setGreenF4(float4* p, float value) { (*p).y = value; } 385 386 inline CLQuantum getRed(CLPixelType p) { return p.z; } 387 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } 388 inline float getRedF4(float4 p) { return p.z; } 389 inline void setRedF4(float4* p, float value) { (*p).z = value; } 390 391 inline CLQuantum getAlpha(CLPixelType p) { return p.w; } 392 inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } 393 inline float getAlphaF4(float4 p) { return p.w; } 394 inline void setAlphaF4(float4* p, float value) { (*p).w = value; } 395 396 inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, 397 const ChannelType channel, float *red, float *green, float *blue, float *alpha) 398 { 399 if ((channel & RedChannel) != 0) 400 *red=getPixelRed(p); 401 402 if (number_channels > 2) 403 { 404 if ((channel & GreenChannel) != 0) 405 *green=getPixelGreen(p); 406 407 if ((channel & BlueChannel) != 0) 408 *blue=getPixelBlue(p); 409 } 410 411 if (((number_channels == 4) || (number_channels == 2)) && 412 ((channel & AlphaChannel) != 0)) 413 *alpha=getPixelAlpha(p,number_channels); 414 } 415 416 inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, 417 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) 418 { 419 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 420 421 float red = 0.0f; 422 float green = 0.0f; 423 float blue = 0.0f; 424 float alpha = 0.0f; 425 426 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 427 return (float4)(red, green, blue, alpha); 428 } 429 430 inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, 431 const ChannelType channel, float red, float green, float blue, float alpha) 432 { 433 if ((channel & RedChannel) != 0) 434 setPixelRed(p,ClampToQuantum(red)); 435 436 if (number_channels > 2) 437 { 438 if ((channel & GreenChannel) != 0) 439 setPixelGreen(p,ClampToQuantum(green)); 440 441 if ((channel & BlueChannel) != 0) 442 setPixelBlue(p,ClampToQuantum(blue)); 443 } 444 445 if (((number_channels == 4) || (number_channels == 2)) && 446 ((channel & AlphaChannel) != 0)) 447 setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); 448 } 449 450 inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, 451 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, 452 float4 pixel) 453 { 454 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 455 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); 456 } 457 458 inline float GetPixelIntensity(const unsigned int colorspace, 459 const unsigned int method,float red,float green,float blue) 460 { 461 float intensity; 462 463 if (colorspace == GRAYColorspace) 464 return red; 465 466 switch (method) 467 { 468 case AveragePixelIntensityMethod: 469 { 470 intensity=(red+green+blue)/3.0; 471 break; 472 } 473 case BrightnessPixelIntensityMethod: 474 { 475 intensity=MagickMax(MagickMax(red,green),blue); 476 break; 477 } 478 case LightnessPixelIntensityMethod: 479 { 480 intensity=(MagickMin(MagickMin(red,green),blue)+ 481 MagickMax(MagickMax(red,green),blue))/2.0; 482 break; 483 } 484 case MSPixelIntensityMethod: 485 { 486 intensity=(float) (((float) red*red+green*green+blue*blue)/ 487 (3.0*QuantumRange)); 488 break; 489 } 490 case Rec601LumaPixelIntensityMethod: 491 { 492 /* 493 if (image->colorspace == RGBColorspace) 494 { 495 red=EncodePixelGamma(red); 496 green=EncodePixelGamma(green); 497 blue=EncodePixelGamma(blue); 498 } 499 */ 500 intensity=0.298839*red+0.586811*green+0.114350*blue; 501 break; 502 } 503 case Rec601LuminancePixelIntensityMethod: 504 { 505 /* 506 if (image->colorspace == sRGBColorspace) 507 { 508 red=DecodePixelGamma(red); 509 green=DecodePixelGamma(green); 510 blue=DecodePixelGamma(blue); 511 } 512 */ 513 intensity=0.298839*red+0.586811*green+0.114350*blue; 514 break; 515 } 516 case Rec709LumaPixelIntensityMethod: 517 default: 518 { 519 /* 520 if (image->colorspace == RGBColorspace) 521 { 522 red=EncodePixelGamma(red); 523 green=EncodePixelGamma(green); 524 blue=EncodePixelGamma(blue); 525 } 526 */ 527 intensity=0.212656*red+0.715158*green+0.072186*blue; 528 break; 529 } 530 case Rec709LuminancePixelIntensityMethod: 531 { 532 /* 533 if (image->colorspace == sRGBColorspace) 534 { 535 red=DecodePixelGamma(red); 536 green=DecodePixelGamma(green); 537 blue=DecodePixelGamma(blue); 538 } 539 */ 540 intensity=0.212656*red+0.715158*green+0.072186*blue; 541 break; 542 } 543 case RMSPixelIntensityMethod: 544 { 545 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ 546 sqrt(3.0)); 547 break; 548 } 549 } 550 551 return intensity; 552 } 553 554 inline int mirrorBottom(int value) 555 { 556 return (value < 0) ? - (value) : value; 557 } 558 559 inline int mirrorTop(int value, int width) 560 { 561 return (value >= width) ? (2 * width - value - 1) : value; 562 } 563 ) 564 565/* 566%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 567% % 568% % 569% % 570% A d d N o i s e % 571% % 572% % 573% % 574%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 575*/ 576 577 STRINGIFY( 578 /* 579 Part of MWC64X by David Thomas, dt10@imperial.ac.uk 580 This is provided under BSD, full license is with the main package. 581 See http://www.doc.ic.ac.uk/~dt10/research 582 */ 583 584 // Pre: a<M, b<M 585 // Post: r=(a+b) mod M 586 ulong MWC_AddMod64(ulong a, ulong b, ulong M) 587 { 588 ulong v=a+b; 589 //if( (v>=M) || (v<a) ) 590 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. 591 v=v-M; 592 return v; 593 } 594 595 // Pre: a<M,b<M 596 // Post: r=(a*b) mod M 597 // This could be done more efficently, but it is portable, and should 598 // be easy to understand. It can be replaced with any of the better 599 // modular multiplication algorithms (for example if you know you have 600 // double precision available or something). 601 ulong MWC_MulMod64(ulong a, ulong b, ulong M) 602 { 603 ulong r=0; 604 while(a!=0){ 605 if(a&1) 606 r=MWC_AddMod64(r,b,M); 607 b=MWC_AddMod64(b,b,M); 608 a=a>>1; 609 } 610 return r; 611 } 612 613 // Pre: a<M, e>=0 614 // Post: r=(a^b) mod M 615 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on 616 // most architectures 617 ulong MWC_PowMod64(ulong a, ulong e, ulong M) 618 { 619 ulong sqr=a, acc=1; 620 while(e!=0){ 621 if(e&1) 622 acc=MWC_MulMod64(acc,sqr,M); 623 sqr=MWC_MulMod64(sqr,sqr,M); 624 e=e>>1; 625 } 626 return acc; 627 } 628 629 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) 630 { 631 ulong m=MWC_PowMod64(A, distance, M); 632 ulong x=curr.x*(ulong)A+curr.y; 633 x=MWC_MulMod64(x, m, M); 634 return (uint2)((uint)(x/A), (uint)(x%A)); 635 } 636 637 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) 638 { 639 // This is an arbitrary constant for starting LCG jumping from. I didn't 640 // want to start from 1, as then you end up with the two or three first values 641 // being a bit poor in ones - once you've decided that, one constant is as 642 // good as any another. There is no deep mathematical reason for it, I just 643 // generated a random number. 644 enum{ MWC_BASEID = 4077358422479273989UL }; 645 646 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; 647 ulong m=MWC_PowMod64(A, dist, M); 648 649 ulong x=MWC_MulMod64(MWC_BASEID, m, M); 650 return (uint2)((uint)(x/A), (uint)(x%A)); 651 } 652 653 //! Represents the state of a particular generator 654 typedef struct{ uint x; uint c; } mwc64x_state_t; 655 656 enum{ MWC64X_A = 4294883355U }; 657 enum{ MWC64X_M = 18446383549859758079UL }; 658 659 void MWC64X_Step(mwc64x_state_t *s) 660 { 661 uint X=s->x, C=s->c; 662 663 uint Xn=MWC64X_A*X+C; 664 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar 665 uint Cn=mad_hi(MWC64X_A,X,carry); 666 667 s->x=Xn; 668 s->c=Cn; 669 } 670 671 void MWC64X_Skip(mwc64x_state_t *s, ulong distance) 672 { 673 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); 674 s->x=tmp.x; 675 s->c=tmp.y; 676 } 677 678 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) 679 { 680 uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); 681 s->x=tmp.x; 682 s->c=tmp.y; 683 } 684 685 //! Return a 32-bit integer in the range [0..2^32) 686 uint MWC64X_NextUint(mwc64x_state_t *s) 687 { 688 uint res=s->x ^ s->c; 689 MWC64X_Step(s); 690 return res; 691 } 692 693 // 694 // End of MWC64X excerpt 695 // 696 697 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) 698 { 699 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 700 } 701 702 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) 703 { 704 float 705 alpha, 706 beta, 707 noise, 708 sigma; 709 710 noise = 0.0f; 711 alpha=mwcReadPseudoRandomValue(r); 712 switch(noise_type) 713 { 714 case UniformNoise: 715 default: 716 { 717 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); 718 break; 719 } 720 case GaussianNoise: 721 { 722 float 723 gamma, 724 tau; 725 726 if (alpha == 0.0f) 727 alpha=1.0f; 728 beta=mwcReadPseudoRandomValue(r); 729 gamma=sqrt(-2.0f*log(alpha)); 730 sigma=gamma*cospi((2.0f*beta)); 731 tau=gamma*sinpi((2.0f*beta)); 732 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; 733 break; 734 } 735 case ImpulseNoise: 736 { 737 if (alpha < (SigmaImpulse/2.0f)) 738 noise=0.0f; 739 else 740 if (alpha >= (1.0f-(SigmaImpulse/2.0f))) 741 noise=QuantumRange; 742 else 743 noise=pixel; 744 break; 745 } 746 case LaplacianNoise: 747 { 748 if (alpha <= 0.5f) 749 { 750 if (alpha <= MagickEpsilon) 751 noise=(pixel-QuantumRange); 752 else 753 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ 754 0.5f); 755 break; 756 } 757 beta=1.0f-alpha; 758 if (beta <= (0.5f*MagickEpsilon)) 759 noise=(pixel+QuantumRange); 760 else 761 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); 762 break; 763 } 764 case MultiplicativeGaussianNoise: 765 { 766 sigma=1.0f; 767 if (alpha > MagickEpsilon) 768 sigma=sqrt(-2.0f*log(alpha)); 769 beta=mwcReadPseudoRandomValue(r); 770 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* 771 cospi((2.0f*beta))/2.0f); 772 break; 773 } 774 case PoissonNoise: 775 { 776 float 777 poisson; 778 unsigned int i; 779 poisson=exp(-SigmaPoisson*QuantumScale*pixel); 780 for (i=0; alpha > poisson; i++) 781 { 782 beta=mwcReadPseudoRandomValue(r); 783 alpha*=beta; 784 } 785 noise=(QuantumRange*i/SigmaPoisson); 786 break; 787 } 788 case RandomNoise: 789 { 790 noise=(QuantumRange*SigmaRandom*alpha); 791 break; 792 } 793 } 794 return noise; 795 } 796 797 __kernel 798 void AddNoise(const __global CLQuantum *image, 799 const unsigned int number_channels,const ChannelType channel, 800 const unsigned int length,const unsigned int pixelsPerWorkItem, 801 const NoiseType noise_type,const float attenuate,const unsigned int seed0, 802 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, 803 __global CLQuantum *filteredImage) 804 { 805 mwc64x_state_t rng; 806 rng.x = seed0; 807 rng.c = seed1; 808 809 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use 810 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); 811 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams 812 813 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); 814 uint count = pixelsPerWorkItem; 815 816 while (count > 0 && pos < length) 817 { 818 const __global CLQuantum *p = image + pos; 819 __global CLQuantum *q = filteredImage + pos; 820 821 float red; 822 float green; 823 float blue; 824 float alpha; 825 826 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 827 828 if ((channel & RedChannel) != 0) 829 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); 830 831 if (number_channels > 2) 832 { 833 if ((channel & GreenChannel) != 0) 834 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); 835 836 if ((channel & BlueChannel) != 0) 837 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); 838 } 839 840 if (((number_channels == 4) || (number_channels == 2)) && 841 ((channel & AlphaChannel) != 0)) 842 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); 843 844 WriteChannels(q, number_channels, channel, red, green, blue, alpha); 845 846 pos += (get_local_size(0) * number_channels); 847 count--; 848 } 849 } 850 ) 851 852/* 853%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 854% % 855% % 856% % 857% B l u r % 858% % 859% % 860% % 861%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 862*/ 863 864 STRINGIFY( 865 /* 866 Reduce image noise and reduce detail levels by row 867 */ 868 __kernel void BlurRow(const __global CLQuantum *image, 869 const unsigned int number_channels,const ChannelType channel, 870 __constant float *filter,const unsigned int width, 871 const unsigned int imageColumns,const unsigned int imageRows, 872 __local float4 *temp,__global float4 *tempImage) 873 { 874 const int x = get_global_id(0); 875 const int y = get_global_id(1); 876 877 const int columns = imageColumns; 878 879 const unsigned int radius = (width-1)/2; 880 const int wsize = get_local_size(0); 881 const unsigned int loadSize = wsize+width; 882 883 //group coordinate 884 const int groupX=get_local_size(0)*get_group_id(0); 885 886 //parallel load and clamp 887 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) 888 { 889 int cx = ClampToCanvas(i + groupX - radius, columns); 890 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); 891 } 892 893 // barrier 894 barrier(CLK_LOCAL_MEM_FENCE); 895 896 // only do the work if this is not a patched item 897 if (get_global_id(0) < columns) 898 { 899 // compute 900 float4 result = (float4) 0; 901 902 int i = 0; 903 904 for ( ; i+7 < width; ) 905 { 906 for (int j=0; j < 8; j++) 907 result+=filter[i+j]*temp[i+j+get_local_id(0)]; 908 i+=8; 909 } 910 911 for ( ; i < width; i++) 912 result+=filter[i]*temp[i+get_local_id(0)]; 913 914 // write back to global 915 tempImage[y*columns+x] = result; 916 } 917 } 918 ) 919 920 STRINGIFY( 921 /* 922 Reduce image noise and reduce detail levels by line 923 */ 924 __kernel void BlurColumn(const __global float4 *blurRowData, 925 const unsigned int number_channels,const ChannelType channel, 926 __constant float *filter,const unsigned int width, 927 const unsigned int imageColumns,const unsigned int imageRows, 928 __local float4 *temp,__global CLQuantum *filteredImage) 929 { 930 const int x = get_global_id(0); 931 const int y = get_global_id(1); 932 933 const int columns = imageColumns; 934 const int rows = imageRows; 935 936 unsigned int radius = (width-1)/2; 937 const int wsize = get_local_size(1); 938 const unsigned int loadSize = wsize+width; 939 940 //group coordinate 941 const int groupX=get_local_size(0)*get_group_id(0); 942 const int groupY=get_local_size(1)*get_group_id(1); 943 //notice that get_local_size(0) is 1, so 944 //groupX=get_group_id(0); 945 946 //parallel load and clamp 947 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) 948 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; 949 950 // barrier 951 barrier(CLK_LOCAL_MEM_FENCE); 952 953 // only do the work if this is not a patched item 954 if (get_global_id(1) < rows) 955 { 956 // compute 957 float4 result = (float4) 0; 958 959 int i = 0; 960 961 for ( ; i+7 < width; ) 962 { 963 for (int j=0; j < 8; j++) 964 result+=filter[i+j]*temp[i+j+get_local_id(1)]; 965 i+=8; 966 } 967 968 for ( ; i < width; i++) 969 result+=filter[i]*temp[i+get_local_id(1)]; 970 971 // write back to global 972 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 973 } 974 } 975 ) 976 977/* 978%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 979% % 980% % 981% % 982% C o m p o s i t e % 983% % 984% % 985% % 986%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 987*/ 988 989 STRINGIFY( 990 inline float ColorDodge(const float Sca, 991 const float Sa,const float Dca,const float Da) 992 { 993 /* 994 Oct 2004 SVG specification. 995 */ 996 if ((Sca*Da+Dca*Sa) >= Sa*Da) 997 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); 998 return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); 999 1000 1001 /* 1002 New specification, March 2009 SVG specification. This specification was 1003 also wrong of non-overlap cases. 1004 */ 1005 /* 1006 if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) 1007 return(Sca*(1.0-Da)); 1008 if (fabs(Sca-Sa) < MagickEpsilon) 1009 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); 1010 return(Sa*MagickMin(Da,Dca*Sa/(Sa-Sca))); 1011 */ 1012 1013 /* 1014 Working from first principles using the original formula: 1015 1016 f(Sc,Dc) = Dc/(1-Sc) 1017 1018 This works correctly! Looks like the 2004 model was right but just 1019 required a extra condition for correct handling. 1020 */ 1021 1022 /* 1023 if ((fabs(Sca-Sa) < MagickEpsilon) && (fabs(Dca) < MagickEpsilon)) 1024 return(Sca*(1.0-Da)+Dca*(1.0-Sa)); 1025 if (fabs(Sca-Sa) < MagickEpsilon) 1026 return(Sa*Da+Sca*(1.0-Da)+Dca*(1.0-Sa)); 1027 return(Dca*Sa*Sa/(Sa-Sca)+Sca*(1.0-Da)+Dca*(1.0-Sa)); 1028 */ 1029 } 1030 1031 inline void CompositeColorDodge(const float4 *p, 1032 const float4 *q,float4 *composite) { 1033 1034 float 1035 Da, 1036 gamma, 1037 Sa; 1038 1039 Sa=QuantumScale*getAlphaF4(*p); /* simplify and speed up equations */ 1040 Da=QuantumScale*getAlphaF4(*q); 1041 gamma=RoundToUnity(Sa+Da-Sa*Da); /* over blend, as per SVG doc */ 1042 setAlphaF4(composite,QuantumRange*gamma); 1043 gamma=QuantumRange/(fabs(gamma) < MagickEpsilon ? MagickEpsilon : gamma); 1044 setRedF4(composite,gamma*ColorDodge(QuantumScale*getRedF4(*p)*Sa,Sa,QuantumScale* 1045 getRedF4(*q)*Da,Da)); 1046 setGreenF4(composite,gamma*ColorDodge(QuantumScale*getGreenF4(*p)*Sa,Sa,QuantumScale* 1047 getGreenF4(*q)*Da,Da)); 1048 setBlueF4(composite,gamma*ColorDodge(QuantumScale*getBlueF4(*p)*Sa,Sa,QuantumScale* 1049 getBlueF4(*q)*Da,Da)); 1050 } 1051 ) 1052 1053 STRINGIFY( 1054 inline void MagickPixelCompositePlus(const float4 *p, 1055 const float alpha,const float4 *q, 1056 const float beta,float4 *composite) 1057 { 1058 float 1059 gamma; 1060 1061 float 1062 Da, 1063 Sa; 1064 /* 1065 Add two pixels with the given opacities. 1066 */ 1067 Sa=QuantumScale*alpha; 1068 Da=QuantumScale*beta; 1069 gamma=RoundToUnity(Sa+Da); /* 'Plus' blending -- not 'Over' blending */ 1070 setAlphaF4(composite,QuantumRange*gamma); 1071 gamma=PerceptibleReciprocal(gamma); 1072 setRedF4(composite,gamma*(Sa*getRedF4(*p)+Da*getRedF4(*q))); 1073 setGreenF4(composite,gamma*(Sa*getGreenF4(*p)+Da*getGreenF4(*q))); 1074 setBlueF4(composite,gamma*(Sa*getBlueF4(*p)+Da*getBlueF4(*q))); 1075 } 1076 ) 1077 1078 STRINGIFY( 1079 inline void MagickPixelCompositeBlend(const float4 *p, 1080 const float alpha,const float4 *q, 1081 const float beta,float4 *composite) 1082 { 1083 MagickPixelCompositePlus(p,(float) (alpha* 1084 (getAlphaF4(*p))),q,(float) (beta* 1085 (getAlphaF4(*q))),composite); 1086 } 1087 ) 1088 1089 STRINGIFY( 1090 __kernel 1091 void Composite(__global CLPixelType *image, 1092 const unsigned int imageWidth, 1093 const unsigned int imageHeight, 1094 const __global CLPixelType *compositeImage, 1095 const unsigned int compositeWidth, 1096 const unsigned int compositeHeight, 1097 const unsigned int compose, 1098 const ChannelType channel, 1099 const unsigned int matte, 1100 const float destination_dissolve, 1101 const float source_dissolve) { 1102 1103 uint2 index; 1104 index.x = get_global_id(0); 1105 index.y = get_global_id(1); 1106 1107 1108 if (index.x >= imageWidth 1109 || index.y >= imageHeight) { 1110 return; 1111 } 1112 const CLPixelType inputPixel = image[index.y*imageWidth+index.x]; 1113 float4 destination; 1114 setRedF4(&destination,getRed(inputPixel)); 1115 setGreenF4(&destination,getGreen(inputPixel)); 1116 setBlueF4(&destination,getBlue(inputPixel)); 1117 1118 1119 const CLPixelType compositePixel 1120 = compositeImage[index.y*imageWidth+index.x]; 1121 float4 source; 1122 setRedF4(&source,getRed(compositePixel)); 1123 setGreenF4(&source,getGreen(compositePixel)); 1124 setBlueF4(&source,getBlue(compositePixel)); 1125 1126 if (matte != 0) { 1127 setAlphaF4(&destination,getAlpha(inputPixel)); 1128 setAlphaF4(&source,getAlpha(compositePixel)); 1129 } 1130 else { 1131 setAlphaF4(&destination,1.0f); 1132 setAlphaF4(&source,1.0f); 1133 } 1134 1135 float4 composite=destination; 1136 1137 CompositeOperator op = (CompositeOperator)compose; 1138 switch (op) { 1139 case ColorDodgeCompositeOp: 1140 CompositeColorDodge(&source,&destination,&composite); 1141 break; 1142 case BlendCompositeOp: 1143 MagickPixelCompositeBlend(&source,source_dissolve,&destination, 1144 destination_dissolve,&composite); 1145 break; 1146 default: 1147 // unsupported operators 1148 break; 1149 }; 1150 1151 CLPixelType outputPixel; 1152 setRed(&outputPixel, ClampToQuantum(getRedF4(composite))); 1153 setGreen(&outputPixel, ClampToQuantum(getGreenF4(composite))); 1154 setBlue(&outputPixel, ClampToQuantum(getBlueF4(composite))); 1155 setAlpha(&outputPixel, ClampToQuantum(getAlphaF4(composite))); 1156 image[index.y*imageWidth+index.x] = outputPixel; 1157 } 1158 ) 1159 1160/* 1161%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1162% % 1163% % 1164% % 1165% C o n t r a s t % 1166% % 1167% % 1168% % 1169%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1170*/ 1171 1172 STRINGIFY( 1173 1174 inline float3 ConvertRGBToHSB(CLPixelType pixel) { 1175 float3 HueSaturationBrightness; 1176 HueSaturationBrightness.x = 0.0f; // Hue 1177 HueSaturationBrightness.y = 0.0f; // Saturation 1178 HueSaturationBrightness.z = 0.0f; // Brightness 1179 1180 float r=(float) getRed(pixel); 1181 float g=(float) getGreen(pixel); 1182 float b=(float) getBlue(pixel); 1183 1184 float tmin=MagickMin(MagickMin(r,g),b); 1185 float tmax=MagickMax(MagickMax(r,g),b); 1186 1187 if (tmax!=0.0f) { 1188 float delta=tmax-tmin; 1189 HueSaturationBrightness.y=delta/tmax; 1190 HueSaturationBrightness.z=QuantumScale*tmax; 1191 1192 if (delta != 0.0f) { 1193 HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); 1194 HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta; 1195 HueSaturationBrightness.x/=6.0f; 1196 HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; 1197 } 1198 } 1199 return HueSaturationBrightness; 1200 } 1201 1202 inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { 1203 1204 float hue = HueSaturationBrightness.x; 1205 float brightness = HueSaturationBrightness.z; 1206 float saturation = HueSaturationBrightness.y; 1207 1208 CLPixelType rgb; 1209 1210 if (saturation == 0.0f) { 1211 setRed(&rgb,ClampToQuantum(QuantumRange*brightness)); 1212 setGreen(&rgb,getRed(rgb)); 1213 setBlue(&rgb,getRed(rgb)); 1214 } 1215 else { 1216 1217 float h=6.0f*(hue-floor(hue)); 1218 float f=h-floor(h); 1219 float p=brightness*(1.0f-saturation); 1220 float q=brightness*(1.0f-saturation*f); 1221 float t=brightness*(1.0f-(saturation*(1.0f-f))); 1222 1223 float clampedBrightness = ClampToQuantum(QuantumRange*brightness); 1224 float clamped_t = ClampToQuantum(QuantumRange*t); 1225 float clamped_p = ClampToQuantum(QuantumRange*p); 1226 float clamped_q = ClampToQuantum(QuantumRange*q); 1227 int ih = (int)h; 1228 setRed(&rgb, (ih == 1)?clamped_q: 1229 (ih == 2 || ih == 3)?clamped_p: 1230 (ih == 4)?clamped_t: 1231 clampedBrightness); 1232 1233 setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness: 1234 (ih == 3)?clamped_q: 1235 (ih == 4 || ih == 5)?clamped_p: 1236 clamped_t); 1237 1238 setBlue(&rgb, (ih == 2)?clamped_t: 1239 (ih == 3 || ih == 4)?clampedBrightness: 1240 (ih == 5)?clamped_q: 1241 clamped_p); 1242 } 1243 return rgb; 1244 } 1245 1246 __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) 1247 { 1248 1249 const int sign = sharpen!=0?1:-1; 1250 const int x = get_global_id(0); 1251 const int y = get_global_id(1); 1252 const int columns = get_global_size(0); 1253 const int c = x + y * columns; 1254 1255 CLPixelType pixel = im[c]; 1256 float3 HueSaturationBrightness = ConvertRGBToHSB(pixel); 1257 float brightness = HueSaturationBrightness.z; 1258 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); 1259 brightness = clamp(brightness,0.0f,1.0f); 1260 HueSaturationBrightness.z = brightness; 1261 1262 CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness); 1263 filteredPixel.w = pixel.w; 1264 im[c] = filteredPixel; 1265 } 1266 ) 1267 1268/* 1269%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1270% % 1271% % 1272% % 1273% C o n t r a s t S t r e t c h % 1274% % 1275% % 1276% % 1277%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1278*/ 1279 1280 STRINGIFY( 1281 /* 1282 */ 1283 __kernel void Histogram(__global CLPixelType * restrict im, 1284 const ChannelType channel, 1285 const unsigned int colorspace, 1286 const unsigned int method, 1287 __global uint4 * restrict histogram) 1288 { 1289 const int x = get_global_id(0); 1290 const int y = get_global_id(1); 1291 const int columns = get_global_size(0); 1292 const int c = x + y * columns; 1293 if ((channel & SyncChannels) != 0) 1294 { 1295 float red=(float)getRed(im[c]); 1296 float green=(float)getGreen(im[c]); 1297 float blue=(float)getBlue(im[c]); 1298 1299 float intensity = GetPixelIntensity(colorspace, method, red, green, blue); 1300 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); 1301 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position 1302 } 1303 else 1304 { 1305 // for equalizing, we always need all channels? 1306 // otherwise something more 1307 } 1308 } 1309 ) 1310 1311 STRINGIFY( 1312 /* 1313 */ 1314 __kernel void ContrastStretch(__global CLPixelType * restrict im, 1315 const ChannelType channel, 1316 __global CLPixelType * restrict stretch_map, 1317 const float4 white, const float4 black) 1318 { 1319 const int x = get_global_id(0); 1320 const int y = get_global_id(1); 1321 const int columns = get_global_size(0); 1322 const int c = x + y * columns; 1323 1324 uint ePos; 1325 CLPixelType oValue, eValue; 1326 CLQuantum red, green, blue, alpha; 1327 1328 //read from global 1329 oValue=im[c]; 1330 1331 if ((channel & RedChannel) != 0) 1332 { 1333 if (getRedF4(white) != getRedF4(black)) 1334 { 1335 ePos = ScaleQuantumToMap(getRed(oValue)); 1336 eValue = stretch_map[ePos]; 1337 red = getRed(eValue); 1338 } 1339 } 1340 1341 if ((channel & GreenChannel) != 0) 1342 { 1343 if (getGreenF4(white) != getGreenF4(black)) 1344 { 1345 ePos = ScaleQuantumToMap(getGreen(oValue)); 1346 eValue = stretch_map[ePos]; 1347 green = getGreen(eValue); 1348 } 1349 } 1350 1351 if ((channel & BlueChannel) != 0) 1352 { 1353 if (getBlueF4(white) != getBlueF4(black)) 1354 { 1355 ePos = ScaleQuantumToMap(getBlue(oValue)); 1356 eValue = stretch_map[ePos]; 1357 blue = getBlue(eValue); 1358 } 1359 } 1360 1361 if ((channel & AlphaChannel) != 0) 1362 { 1363 if (getAlphaF4(white) != getAlphaF4(black)) 1364 { 1365 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1366 eValue = stretch_map[ePos]; 1367 alpha = getAlpha(eValue); 1368 } 1369 } 1370 1371 //write back 1372 im[c]=(CLPixelType)(blue, green, red, alpha); 1373 1374 } 1375 ) 1376 1377/* 1378%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1379% % 1380% % 1381% % 1382% C o n v o l v e % 1383% % 1384% % 1385% % 1386%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1387*/ 1388 1389 STRINGIFY( 1390 __kernel 1391 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, 1392 const unsigned int imageWidth, const unsigned int imageHeight, 1393 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1394 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { 1395 1396 int2 blockID; 1397 blockID.x = get_global_id(0) / get_local_size(0); 1398 blockID.y = get_global_id(1) / get_local_size(1); 1399 1400 // image area processed by this workgroup 1401 int2 imageAreaOrg; 1402 imageAreaOrg.x = blockID.x * get_local_size(0); 1403 imageAreaOrg.y = blockID.y * get_local_size(1); 1404 1405 int2 midFilterDimen; 1406 midFilterDimen.x = (filterWidth-1)/2; 1407 midFilterDimen.y = (filterHeight-1)/2; 1408 1409 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; 1410 1411 // dimension of the local cache 1412 int2 cachedAreaDimen; 1413 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; 1414 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; 1415 1416 // cache the pixels accessed by this workgroup in local memory 1417 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); 1418 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; 1419 int groupSize = get_local_size(0) * get_local_size(1); 1420 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { 1421 1422 int2 cachedAreaIndex; 1423 cachedAreaIndex.x = i % cachedAreaDimen.x; 1424 cachedAreaIndex.y = i / cachedAreaDimen.x; 1425 1426 int2 imagePixelIndex; 1427 imagePixelIndex = cachedAreaOrg + cachedAreaIndex; 1428 1429 // only support EdgeVirtualPixelMethod through ClampToCanvas 1430 // TODO: implement other virtual pixel method 1431 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); 1432 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); 1433 1434 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; 1435 } 1436 1437 // cache the filter 1438 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { 1439 filterCache[i] = filter[i]; 1440 } 1441 barrier(CLK_LOCAL_MEM_FENCE); 1442 1443 1444 int2 imageIndex; 1445 imageIndex.x = imageAreaOrg.x + get_local_id(0); 1446 imageIndex.y = imageAreaOrg.y + get_local_id(1); 1447 1448 // if out-of-range, stops here and quit 1449 if (imageIndex.x >= imageWidth 1450 || imageIndex.y >= imageHeight) { 1451 return; 1452 } 1453 1454 int filterIndex = 0; 1455 float4 sum = (float4)0.0f; 1456 float gamma = 0.0f; 1457 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1458 int cacheIndexY = get_local_id(1); 1459 for (int j = 0; j < filterHeight; j++) { 1460 int cacheIndexX = get_local_id(0); 1461 for (int i = 0; i < filterWidth; i++) { 1462 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1463 float f = filterCache[filterIndex]; 1464 1465 sum.x += f * p.x; 1466 sum.y += f * p.y; 1467 sum.z += f * p.z; 1468 sum.w += f * p.w; 1469 1470 gamma += f; 1471 filterIndex++; 1472 cacheIndexX++; 1473 } 1474 cacheIndexY++; 1475 } 1476 } 1477 else { 1478 int cacheIndexY = get_local_id(1); 1479 for (int j = 0; j < filterHeight; j++) { 1480 int cacheIndexX = get_local_id(0); 1481 for (int i = 0; i < filterWidth; i++) { 1482 1483 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1484 float alpha = QuantumScale*p.w; 1485 float f = filterCache[filterIndex]; 1486 float g = alpha * f; 1487 1488 sum.x += g*p.x; 1489 sum.y += g*p.y; 1490 sum.z += g*p.z; 1491 sum.w += f*p.w; 1492 1493 gamma += g; 1494 filterIndex++; 1495 cacheIndexX++; 1496 } 1497 cacheIndexY++; 1498 } 1499 gamma = PerceptibleReciprocal(gamma); 1500 sum.xyz = gamma*sum.xyz; 1501 } 1502 CLPixelType outputPixel; 1503 outputPixel.x = ClampToQuantum(sum.x); 1504 outputPixel.y = ClampToQuantum(sum.y); 1505 outputPixel.z = ClampToQuantum(sum.z); 1506 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1507 1508 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1509 } 1510 ) 1511 1512 STRINGIFY( 1513 __kernel 1514 void Convolve(const __global CLPixelType *input, __global CLPixelType *output, 1515 const uint imageWidth, const uint imageHeight, 1516 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1517 const uint matte, const ChannelType channel) { 1518 1519 int2 imageIndex; 1520 imageIndex.x = get_global_id(0); 1521 imageIndex.y = get_global_id(1); 1522 1523 /* 1524 unsigned int imageWidth = get_global_size(0); 1525 unsigned int imageHeight = get_global_size(1); 1526 */ 1527 if (imageIndex.x >= imageWidth 1528 || imageIndex.y >= imageHeight) 1529 return; 1530 1531 int2 midFilterDimen; 1532 midFilterDimen.x = (filterWidth-1)/2; 1533 midFilterDimen.y = (filterHeight-1)/2; 1534 1535 int filterIndex = 0; 1536 float4 sum = (float4)0.0f; 1537 float gamma = 0.0f; 1538 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1539 for (int j = 0; j < filterHeight; j++) { 1540 int2 inputPixelIndex; 1541 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1542 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1543 for (int i = 0; i < filterWidth; i++) { 1544 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1545 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1546 1547 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1548 float f = filter[filterIndex]; 1549 1550 sum.x += f * p.x; 1551 sum.y += f * p.y; 1552 sum.z += f * p.z; 1553 sum.w += f * p.w; 1554 1555 gamma += f; 1556 1557 filterIndex++; 1558 } 1559 } 1560 } 1561 else { 1562 1563 for (int j = 0; j < filterHeight; j++) { 1564 int2 inputPixelIndex; 1565 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1566 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1567 for (int i = 0; i < filterWidth; i++) { 1568 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1569 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1570 1571 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1572 float alpha = QuantumScale*p.w; 1573 float f = filter[filterIndex]; 1574 float g = alpha * f; 1575 1576 sum.x += g*p.x; 1577 sum.y += g*p.y; 1578 sum.z += g*p.z; 1579 sum.w += f*p.w; 1580 1581 gamma += g; 1582 1583 1584 filterIndex++; 1585 } 1586 } 1587 gamma = PerceptibleReciprocal(gamma); 1588 sum.xyz = gamma*sum.xyz; 1589 } 1590 1591 CLPixelType outputPixel; 1592 outputPixel.x = ClampToQuantum(sum.x); 1593 outputPixel.y = ClampToQuantum(sum.y); 1594 outputPixel.z = ClampToQuantum(sum.z); 1595 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1596 1597 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1598 } 1599 ) 1600 1601/* 1602%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1603% % 1604% % 1605% % 1606% D e s p e c k l e % 1607% % 1608% % 1609% % 1610%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1611*/ 1612 1613 STRINGIFY( 1614 1615 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1616 , const unsigned int imageWidth, const unsigned int imageHeight 1617 , const int2 offset, const int polarity, const int matte) { 1618 1619 int x = get_global_id(0); 1620 int y = get_global_id(1); 1621 1622 CLPixelType v = inputImage[y*imageWidth+x]; 1623 1624 int2 neighbor; 1625 neighbor.y = y + offset.y; 1626 neighbor.x = x + offset.x; 1627 1628 int2 clampedNeighbor; 1629 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1630 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1631 1632 CLPixelType r = (clampedNeighbor.x == neighbor.x 1633 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1634 :(CLPixelType)0; 1635 1636 int sv[4]; 1637 sv[0] = (int)v.x; 1638 sv[1] = (int)v.y; 1639 sv[2] = (int)v.z; 1640 sv[3] = (int)v.w; 1641 1642 int sr[4]; 1643 sr[0] = (int)r.x; 1644 sr[1] = (int)r.y; 1645 sr[2] = (int)r.z; 1646 sr[3] = (int)r.w; 1647 1648 if (polarity > 0) { 1649 \n #pragma unroll 4\n 1650 for (unsigned int i = 0; i < 4; i++) { 1651 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; 1652 } 1653 } 1654 else { 1655 \n #pragma unroll 4\n 1656 for (unsigned int i = 0; i < 4; i++) { 1657 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; 1658 } 1659 1660 } 1661 1662 v.x = (CLQuantum)sv[0]; 1663 v.y = (CLQuantum)sv[1]; 1664 v.z = (CLQuantum)sv[2]; 1665 1666 if (matte!=0) 1667 v.w = (CLQuantum)sv[3]; 1668 1669 outputImage[y*imageWidth+x] = v; 1670 1671 } 1672 1673 1674 ) 1675 1676 STRINGIFY( 1677 1678 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1679 , const unsigned int imageWidth, const unsigned int imageHeight 1680 , const int2 offset, const int polarity, const int matte) { 1681 1682 int x = get_global_id(0); 1683 int y = get_global_id(1); 1684 1685 CLPixelType v = inputImage[y*imageWidth+x]; 1686 1687 int2 neighbor, clampedNeighbor; 1688 1689 neighbor.y = y + offset.y; 1690 neighbor.x = x + offset.x; 1691 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1692 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1693 1694 CLPixelType r = (clampedNeighbor.x == neighbor.x 1695 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1696 :(CLPixelType)0; 1697 1698 1699 neighbor.y = y - offset.y; 1700 neighbor.x = x - offset.x; 1701 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1702 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1703 1704 CLPixelType s = (clampedNeighbor.x == neighbor.x 1705 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1706 :(CLPixelType)0; 1707 1708 1709 int sv[4]; 1710 sv[0] = (int)v.x; 1711 sv[1] = (int)v.y; 1712 sv[2] = (int)v.z; 1713 sv[3] = (int)v.w; 1714 1715 int sr[4]; 1716 sr[0] = (int)r.x; 1717 sr[1] = (int)r.y; 1718 sr[2] = (int)r.z; 1719 sr[3] = (int)r.w; 1720 1721 int ss[4]; 1722 ss[0] = (int)s.x; 1723 ss[1] = (int)s.y; 1724 ss[2] = (int)s.z; 1725 ss[3] = (int)s.w; 1726 1727 if (polarity > 0) { 1728 \n #pragma unroll 4\n 1729 for (unsigned int i = 0; i < 4; i++) { 1730 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; 1731 // 1732 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1733 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1734 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1735 } 1736 } 1737 else { 1738 \n #pragma unroll 4\n 1739 for (unsigned int i = 0; i < 4; i++) { 1740 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; 1741 // 1742 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1743 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1744 } 1745 } 1746 1747 v.x = (CLQuantum)sv[0]; 1748 v.y = (CLQuantum)sv[1]; 1749 v.z = (CLQuantum)sv[2]; 1750 1751 if (matte!=0) 1752 v.w = (CLQuantum)sv[3]; 1753 1754 outputImage[y*imageWidth+x] = v; 1755 1756 } 1757 ) 1758 1759/* 1760%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1761% % 1762% % 1763% % 1764% E q u a l i z e % 1765% % 1766% % 1767% % 1768%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1769*/ 1770 1771 STRINGIFY( 1772 /* 1773 */ 1774 __kernel void Equalize(__global CLPixelType * restrict im, 1775 const ChannelType channel, 1776 __global CLPixelType * restrict equalize_map, 1777 const float4 white, const float4 black) 1778 { 1779 const int x = get_global_id(0); 1780 const int y = get_global_id(1); 1781 const int columns = get_global_size(0); 1782 const int c = x + y * columns; 1783 1784 uint ePos; 1785 CLPixelType oValue, eValue; 1786 CLQuantum red, green, blue, alpha; 1787 1788 //read from global 1789 oValue=im[c]; 1790 1791 if ((channel & SyncChannels) != 0) 1792 { 1793 if (getRedF4(white) != getRedF4(black)) 1794 { 1795 ePos = ScaleQuantumToMap(getRed(oValue)); 1796 eValue = equalize_map[ePos]; 1797 red = getRed(eValue); 1798 ePos = ScaleQuantumToMap(getGreen(oValue)); 1799 eValue = equalize_map[ePos]; 1800 green = getRed(eValue); 1801 ePos = ScaleQuantumToMap(getBlue(oValue)); 1802 eValue = equalize_map[ePos]; 1803 blue = getRed(eValue); 1804 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1805 eValue = equalize_map[ePos]; 1806 alpha = getRed(eValue); 1807 1808 //write back 1809 im[c]=(CLPixelType)(blue, green, red, alpha); 1810 } 1811 1812 } 1813 1814 // for equalizing, we always need all channels? 1815 // otherwise something more 1816 1817 } 1818 ) 1819 1820/* 1821%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1822% % 1823% % 1824% % 1825% F u n c t i o n % 1826% % 1827% % 1828% % 1829%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1830*/ 1831 1832 STRINGIFY( 1833 /* 1834 apply FunctionImageChannel(braightness-contrast) 1835 */ 1836 CLQuantum ApplyFunction(float pixel,const MagickFunction function, 1837 const unsigned int number_parameters,__constant float *parameters) 1838 { 1839 float result = 0.0f; 1840 1841 switch (function) 1842 { 1843 case PolynomialFunction: 1844 { 1845 for (unsigned int i=0; i < number_parameters; i++) 1846 result = result*QuantumScale*pixel + parameters[i]; 1847 result *= QuantumRange; 1848 break; 1849 } 1850 case SinusoidFunction: 1851 { 1852 float freq,phase,ampl,bias; 1853 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1854 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; 1855 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; 1856 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1857 result = QuantumRange*(ampl*sin(2.0f*MagickPI* 1858 (freq*QuantumScale*pixel + phase/360.0f)) + bias); 1859 break; 1860 } 1861 case ArcsinFunction: 1862 { 1863 float width,range,center,bias; 1864 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1865 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1866 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1867 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1868 1869 result = 2.0f/width*(QuantumScale*pixel - center); 1870 result = range/MagickPI*asin(result)+bias; 1871 result = ( result <= -1.0f ) ? bias - range/2.0f : result; 1872 result = ( result >= 1.0f ) ? bias + range/2.0f : result; 1873 result *= QuantumRange; 1874 break; 1875 } 1876 case ArctanFunction: 1877 { 1878 float slope,range,center,bias; 1879 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1880 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1881 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1882 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1883 result = MagickPI*slope*(QuantumScale*pixel-center); 1884 result = QuantumRange*(range/MagickPI*atan(result) + bias); 1885 break; 1886 } 1887 case UndefinedFunction: 1888 break; 1889 } 1890 return(ClampToQuantum(result)); 1891 } 1892 ) 1893 1894 STRINGIFY( 1895 /* 1896 Improve brightness / contrast of the image 1897 channel : define which channel is improved 1898 function : the function called to enchance the brightness contrast 1899 number_parameters : numbers of parameters 1900 parameters : the parameter 1901 */ 1902 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, 1903 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, 1904 __constant float *parameters) 1905 { 1906 const unsigned int x = get_global_id(0); 1907 const unsigned int y = get_global_id(1); 1908 const unsigned int columns = get_global_size(0); 1909 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1910 1911 float red; 1912 float green; 1913 float blue; 1914 float alpha; 1915 1916 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 1917 1918 if ((channel & RedChannel) != 0) 1919 red=ApplyFunction(red, function, number_parameters, parameters); 1920 1921 if (number_channels > 2) 1922 { 1923 if ((channel & GreenChannel) != 0) 1924 green=ApplyFunction(green, function, number_parameters, parameters); 1925 1926 if ((channel & BlueChannel) != 0) 1927 blue=ApplyFunction(blue, function, number_parameters, parameters); 1928 } 1929 1930 if (((number_channels == 4) || (number_channels == 2)) && 1931 ((channel & AlphaChannel) != 0)) 1932 alpha=ApplyFunction(alpha, function, number_parameters, parameters); 1933 1934 WriteChannels(p, number_channels, channel, red, green, blue, alpha); 1935 } 1936 ) 1937 1938/* 1939%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1940% % 1941% % 1942% % 1943% G r a y s c a l e % 1944% % 1945% % 1946% % 1947%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1948*/ 1949 1950 STRINGIFY( 1951 __kernel void Grayscale(__global CLQuantum *image,const int number_channels, 1952 const unsigned int colorspace,const unsigned int method) 1953 { 1954 const unsigned int x = get_global_id(0); 1955 const unsigned int y = get_global_id(1); 1956 const unsigned int columns = get_global_size(0); 1957 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1958 1959 float 1960 blue, 1961 green, 1962 red; 1963 1964 red=getPixelRed(p); 1965 green=getPixelGreen(p); 1966 blue=getPixelBlue(p); 1967 1968 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); 1969 1970 setPixelRed(p,intensity); 1971 setPixelGreen(p,intensity); 1972 setPixelBlue(p,intensity); 1973 } 1974 ) 1975 1976/* 1977%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1978% % 1979% % 1980% % 1981% L o c a l C o n t r a s t % 1982% % 1983% % 1984% % 1985%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1986*/ 1987 1988 STRINGIFY( 1989 1990 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, 1991 const int radius, 1992 const int imageWidth, 1993 const int imageHeight) 1994 { 1995 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); 1996 1997 int x = get_local_id(0); 1998 int y = get_global_id(1); 1999 2000 global CLPixelType *src = srcImage + y * imageWidth; 2001 2002 for (int i = x; i < imageWidth; i += get_local_size(0)) { 2003 float sum = 0.0f; 2004 float weight = 1.0f; 2005 2006 int j = i - radius; 2007 while ((j + 7) < i) { 2008 for (int k = 0; k < 8; ++k) // Unroll 8x 2009 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); 2010 weight += 8.0f; 2011 j+=8; 2012 } 2013 while (j < i) { 2014 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); 2015 weight += 1.0f; 2016 ++j; 2017 } 2018 2019 while ((j + 7) < radius + i) { 2020 for (int k = 0; k < 8; ++k) // Unroll 8x 2021 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); 2022 weight -= 8.0f; 2023 j+=8; 2024 } 2025 while (j < radius + i) { 2026 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); 2027 weight -= 1.0f; 2028 ++j; 2029 } 2030 2031 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); 2032 } 2033 } 2034 ) 2035 2036 STRINGIFY( 2037 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, 2038 const int radius, 2039 const float strength, 2040 const int imageWidth, 2041 const int imageHeight) 2042 { 2043 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); 2044 2045 int x = get_global_id(0); 2046 int y = get_global_id(1); 2047 2048 if ((x >= imageWidth) || (y >= imageHeight)) 2049 return; 2050 2051 global float *src = blurImage + x; 2052 2053 float sum = 0.0f; 2054 float weight = 1.0f; 2055 2056 int j = y - radius; 2057 while ((j + 7) < y) { 2058 for (int k = 0; k < 8; ++k) // Unroll 8x 2059 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; 2060 weight += 8.0f; 2061 j+=8; 2062 } 2063 while (j < y) { 2064 sum += weight * src[mirrorBottom(j) * imageWidth]; 2065 weight += 1.0f; 2066 ++j; 2067 } 2068 2069 while ((j + 7) < radius + y) { 2070 for (int k = 0; k < 8; ++k) // Unroll 8x 2071 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; 2072 weight -= 8.0f; 2073 j+=8; 2074 } 2075 while (j < radius + y) { 2076 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; 2077 weight -= 1.0f; 2078 ++j; 2079 } 2080 2081 CLPixelType pixel = srcImage[x + y * imageWidth]; 2082 float srcVal = dot(RGB, convert_float4(pixel)); 2083 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); 2084 mult = (srcVal + mult) / srcVal; 2085 2086 pixel.x = ClampToQuantum(pixel.x * mult); 2087 pixel.y = ClampToQuantum(pixel.y * mult); 2088 pixel.z = ClampToQuantum(pixel.z * mult); 2089 2090 dstImage[x + y * imageWidth] = pixel; 2091 } 2092 ) 2093 2094/* 2095%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2096% % 2097% % 2098% % 2099% M o d u l a t e % 2100% % 2101% % 2102% % 2103%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2104*/ 2105 2106 STRINGIFY( 2107 2108 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, 2109 float *hue, float *saturation, float *lightness) 2110 { 2111 float 2112 c, 2113 tmax, 2114 tmin; 2115 2116 /* 2117 Convert RGB to HSL colorspace. 2118 */ 2119 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); 2120 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); 2121 2122 c=tmax-tmin; 2123 2124 *lightness=(tmax+tmin)/2.0; 2125 if (c <= 0.0) 2126 { 2127 *hue=0.0; 2128 *saturation=0.0; 2129 return; 2130 } 2131 2132 if (tmax == (QuantumScale*red)) 2133 { 2134 *hue=(QuantumScale*green-QuantumScale*blue)/c; 2135 if ((QuantumScale*green) < (QuantumScale*blue)) 2136 *hue+=6.0; 2137 } 2138 else 2139 if (tmax == (QuantumScale*green)) 2140 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; 2141 else 2142 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; 2143 2144 *hue*=60.0/360.0; 2145 if (*lightness <= 0.5) 2146 *saturation=c/(2.0*(*lightness)); 2147 else 2148 *saturation=c/(2.0-2.0*(*lightness)); 2149 } 2150 2151 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, 2152 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2153 { 2154 float 2155 b, 2156 c, 2157 g, 2158 h, 2159 tmin, 2160 r, 2161 x; 2162 2163 /* 2164 Convert HSL to RGB colorspace. 2165 */ 2166 h=hue*360.0; 2167 if (lightness <= 0.5) 2168 c=2.0*lightness*saturation; 2169 else 2170 c=(2.0-2.0*lightness)*saturation; 2171 tmin=lightness-0.5*c; 2172 h-=360.0*floor(h/360.0); 2173 h/=60.0; 2174 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); 2175 switch ((int) floor(h) % 6) 2176 { 2177 case 0: 2178 { 2179 r=tmin+c; 2180 g=tmin+x; 2181 b=tmin; 2182 break; 2183 } 2184 case 1: 2185 { 2186 r=tmin+x; 2187 g=tmin+c; 2188 b=tmin; 2189 break; 2190 } 2191 case 2: 2192 { 2193 r=tmin; 2194 g=tmin+c; 2195 b=tmin+x; 2196 break; 2197 } 2198 case 3: 2199 { 2200 r=tmin; 2201 g=tmin+x; 2202 b=tmin+c; 2203 break; 2204 } 2205 case 4: 2206 { 2207 r=tmin+x; 2208 g=tmin; 2209 b=tmin+c; 2210 break; 2211 } 2212 case 5: 2213 { 2214 r=tmin+c; 2215 g=tmin; 2216 b=tmin+x; 2217 break; 2218 } 2219 default: 2220 { 2221 r=0.0; 2222 g=0.0; 2223 b=0.0; 2224 } 2225 } 2226 *red=ClampToQuantum(QuantumRange*r); 2227 *green=ClampToQuantum(QuantumRange*g); 2228 *blue=ClampToQuantum(QuantumRange*b); 2229 } 2230 2231 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 2232 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2233 { 2234 float 2235 hue, 2236 lightness, 2237 saturation; 2238 2239 /* 2240 Increase or decrease color lightness, saturation, or hue. 2241 */ 2242 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); 2243 hue+=0.5*(0.01*percent_hue-1.0); 2244 while (hue < 0.0) 2245 hue+=1.0; 2246 while (hue >= 1.0) 2247 hue-=1.0; 2248 saturation*=0.01*percent_saturation; 2249 lightness*=0.01*percent_lightness; 2250 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); 2251 } 2252 2253 __kernel void Modulate(__global CLPixelType *im, 2254 const float percent_brightness, 2255 const float percent_hue, 2256 const float percent_saturation, 2257 const int colorspace) 2258 { 2259 2260 const int x = get_global_id(0); 2261 const int y = get_global_id(1); 2262 const int columns = get_global_size(0); 2263 const int c = x + y * columns; 2264 2265 CLPixelType pixel = im[c]; 2266 2267 CLQuantum 2268 blue, 2269 green, 2270 red; 2271 2272 red=getRed(pixel); 2273 green=getGreen(pixel); 2274 blue=getBlue(pixel); 2275 2276 switch (colorspace) 2277 { 2278 case HSLColorspace: 2279 default: 2280 { 2281 ModulateHSL(percent_hue, percent_saturation, percent_brightness, 2282 &red, &green, &blue); 2283 } 2284 2285 } 2286 2287 CLPixelType filteredPixel; 2288 2289 setRed(&filteredPixel, red); 2290 setGreen(&filteredPixel, green); 2291 setBlue(&filteredPixel, blue); 2292 filteredPixel.w = pixel.w; 2293 2294 im[c] = filteredPixel; 2295 } 2296 ) 2297 2298/* 2299%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2300% % 2301% % 2302% % 2303% M o t i o n B l u r % 2304% % 2305% % 2306% % 2307%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2308*/ 2309 2310 STRINGIFY( 2311 __kernel 2312 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, 2313 const unsigned int imageWidth, const unsigned int imageHeight, 2314 const __global float *filter, const unsigned int width, const __global int2* offset, 2315 const float4 bias, 2316 const ChannelType channel, const unsigned int matte) { 2317 2318 int2 currentPixel; 2319 currentPixel.x = get_global_id(0); 2320 currentPixel.y = get_global_id(1); 2321 2322 if (currentPixel.x >= imageWidth 2323 || currentPixel.y >= imageHeight) 2324 return; 2325 2326 float4 pixel; 2327 pixel.x = (float)bias.x; 2328 pixel.y = (float)bias.y; 2329 pixel.z = (float)bias.z; 2330 pixel.w = (float)bias.w; 2331 2332 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 2333 2334 for (int i = 0; i < width; i++) { 2335 // only support EdgeVirtualPixelMethod through ClampToCanvas 2336 // TODO: implement other virtual pixel method 2337 int2 samplePixel = currentPixel + offset[i]; 2338 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2339 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2340 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2341 2342 pixel.x += (filter[i] * (float)samplePixelValue.x); 2343 pixel.y += (filter[i] * (float)samplePixelValue.y); 2344 pixel.z += (filter[i] * (float)samplePixelValue.z); 2345 pixel.w += (filter[i] * (float)samplePixelValue.w); 2346 } 2347 2348 CLPixelType outputPixel; 2349 outputPixel.x = ClampToQuantum(pixel.x); 2350 outputPixel.y = ClampToQuantum(pixel.y); 2351 outputPixel.z = ClampToQuantum(pixel.z); 2352 outputPixel.w = ClampToQuantum(pixel.w); 2353 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2354 } 2355 else { 2356 2357 float gamma = 0.0f; 2358 for (int i = 0; i < width; i++) { 2359 // only support EdgeVirtualPixelMethod through ClampToCanvas 2360 // TODO: implement other virtual pixel method 2361 int2 samplePixel = currentPixel + offset[i]; 2362 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2363 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2364 2365 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2366 2367 float alpha = QuantumScale*samplePixelValue.w; 2368 float k = filter[i]; 2369 pixel.x = pixel.x + k * alpha * samplePixelValue.x; 2370 pixel.y = pixel.y + k * alpha * samplePixelValue.y; 2371 pixel.z = pixel.z + k * alpha * samplePixelValue.z; 2372 2373 pixel.w += k * alpha * samplePixelValue.w; 2374 2375 gamma+=k*alpha; 2376 } 2377 gamma = PerceptibleReciprocal(gamma); 2378 pixel.xyz = gamma*pixel.xyz; 2379 2380 CLPixelType outputPixel; 2381 outputPixel.x = ClampToQuantum(pixel.x); 2382 outputPixel.y = ClampToQuantum(pixel.y); 2383 outputPixel.z = ClampToQuantum(pixel.z); 2384 outputPixel.w = ClampToQuantum(pixel.w); 2385 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2386 } 2387 } 2388 ) 2389 2390/* 2391%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2392% % 2393% % 2394% % 2395% R e s i z e % 2396% % 2397% % 2398% % 2399%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2400*/ 2401 2402 STRINGIFY( 2403 // Based on Box from resize.c 2404 float BoxResizeFilter(const float x) 2405 { 2406 return 1.0f; 2407 } 2408 ) 2409 2410 STRINGIFY( 2411 // Based on CubicBC from resize.c 2412 float CubicBC(const float x,const __global float* resizeFilterCoefficients) 2413 { 2414 /* 2415 Cubic Filters using B,C determined values: 2416 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter 2417 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears 2418 Spline B = 1 C = 0 B-Spline Gaussian approximation 2419 Hermite B = 0 C = 0 B-Spline interpolator 2420 2421 See paper by Mitchell and Netravali, Reconstruction Filters in Computer 2422 Graphics Computer Graphics, Volume 22, Number 4, August 1988 2423 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ 2424 Mitchell.pdf. 2425 2426 Coefficents are determined from B,C values: 2427 P0 = ( 6 - 2*B )/6 = coeff[0] 2428 P1 = 0 2429 P2 = (-18 +12*B + 6*C )/6 = coeff[1] 2430 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] 2431 Q0 = ( 8*B +24*C )/6 = coeff[3] 2432 Q1 = ( -12*B -48*C )/6 = coeff[4] 2433 Q2 = ( 6*B +30*C )/6 = coeff[5] 2434 Q3 = ( - 1*B - 6*C )/6 = coeff[6] 2435 2436 which are used to define the filter: 2437 2438 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 2439 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 2440 2441 which ensures function is continuous in value and derivative (slope). 2442 */ 2443 if (x < 1.0) 2444 return(resizeFilterCoefficients[0]+x*(x* 2445 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); 2446 if (x < 2.0) 2447 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* 2448 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); 2449 return(0.0); 2450 } 2451 ) 2452 2453 STRINGIFY( 2454 float Sinc(const float x) 2455 { 2456 if (x != 0.0f) 2457 { 2458 const float alpha=(float) (MagickPI*x); 2459 return sinpi(x)/alpha; 2460 } 2461 return(1.0f); 2462 } 2463 ) 2464 2465 STRINGIFY( 2466 float Triangle(const float x) 2467 { 2468 /* 2469 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or 2470 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function 2471 for Sinc(). 2472 */ 2473 return ((x<1.0f)?(1.0f-x):0.0f); 2474 } 2475 ) 2476 2477 2478 STRINGIFY( 2479 float Hann(const float x) 2480 { 2481 /* 2482 Cosine window function: 2483 0.5+0.5*cos(pi*x). 2484 */ 2485 const float cosine=cos((MagickPI*x)); 2486 return(0.5f+0.5f*cosine); 2487 } 2488 ) 2489 2490 STRINGIFY( 2491 float Hamming(const float x) 2492 { 2493 /* 2494 Offset cosine window function: 2495 .54 + .46 cos(pi x). 2496 */ 2497 const float cosine=cos((MagickPI*x)); 2498 return(0.54f+0.46f*cosine); 2499 } 2500 ) 2501 2502 STRINGIFY( 2503 float Blackman(const float x) 2504 { 2505 /* 2506 Blackman: 2nd order cosine windowing function: 2507 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) 2508 2509 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and 2510 five flops. 2511 */ 2512 const float cosine=cos((MagickPI*x)); 2513 return(0.34f+cosine*(0.5f+cosine*0.16f)); 2514 } 2515 ) 2516 2517 STRINGIFY( 2518 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) 2519 { 2520 switch (filterType) 2521 { 2522 /* Call Sinc even for SincFast to get better precision on GPU 2523 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ 2524 case SincWeightingFunction: 2525 case SincFastWeightingFunction: 2526 return Sinc(x); 2527 case CubicBCWeightingFunction: 2528 return CubicBC(x,filterCoefficients); 2529 case BoxWeightingFunction: 2530 return BoxResizeFilter(x); 2531 case TriangleWeightingFunction: 2532 return Triangle(x); 2533 case HannWeightingFunction: 2534 return Hann(x); 2535 case HammingWeightingFunction: 2536 return Hamming(x); 2537 case BlackmanWeightingFunction: 2538 return Blackman(x); 2539 2540 default: 2541 return 0.0f; 2542 } 2543 } 2544 ) 2545 2546 2547 STRINGIFY( 2548 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType 2549 , const ResizeWeightingFunctionType resizeWindowType 2550 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) 2551 { 2552 float scale; 2553 float xBlur = fabs(x/resizeFilterBlur); 2554 if (resizeWindowSupport < MagickEpsilon 2555 || resizeWindowType == BoxWeightingFunction) 2556 { 2557 scale = 1.0f; 2558 } 2559 else 2560 { 2561 scale = resizeFilterScale; 2562 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); 2563 } 2564 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); 2565 return weight; 2566 } 2567 2568 ) 2569 2570 ; 2571 const char *accelerateKernels2 = 2572 2573 STRINGIFY( 2574 2575 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2576 return (numWorkItems/pixelPerWorkgroup); 2577 } 2578 2579 // returns the index of the pixel for the current workitem to compute. 2580 // returns -1 if this workitem doesn't need to participate in any computation 2581 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2582 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); 2583 int pixelIndex = itemID/numWorkItemsPerPixel; 2584 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; 2585 return pixelIndex; 2586 } 2587 2588 ) 2589 2590 STRINGIFY( 2591 __kernel __attribute__((reqd_work_group_size(256, 1, 1))) 2592 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2593 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2594 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, 2595 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2596 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2597 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2598 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2599 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2600 { 2601 // calculate the range of resized image pixels computed by this workgroup 2602 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; 2603 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); 2604 const unsigned int actualNumPixelToCompute = stopX - startX; 2605 2606 // calculate the range of input image pixels to cache 2607 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); 2608 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2609 scale = PerceptibleReciprocal(scale); 2610 2611 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); 2612 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); 2613 2614 // cache the input pixels into local memory 2615 const unsigned int y = get_global_id(1); 2616 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); 2617 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; 2618 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); 2619 wait_group_events(1, &e); 2620 2621 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2622 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2623 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2624 { 2625 const unsigned int chunkStartX = startX + chunk*pixelChunkSize; 2626 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); 2627 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; 2628 2629 // determine which resized pixel computed by this workitem 2630 const unsigned int itemID = get_local_id(0); 2631 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); 2632 2633 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); 2634 2635 float4 filteredPixel = (float4)0.0f; 2636 float density = 0.0f; 2637 float gamma = 0.0f; 2638 // -1 means this workitem doesn't participate in the computation 2639 if (pixelIndex != -1) 2640 { 2641 // x coordinated of the resized pixel computed by this workitem 2642 const int x = chunkStartX + pixelIndex; 2643 2644 // calculate how many steps required for this pixel 2645 const float bisect = (x+0.5)/xFactor+MagickEpsilon; 2646 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2647 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); 2648 const unsigned int n = stop - start; 2649 2650 // calculate how many steps this workitem will contribute 2651 unsigned int numStepsPerWorkItem = n / numItems; 2652 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2653 2654 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2655 if (startStep < n) 2656 { 2657 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2658 2659 unsigned int cacheIndex = start+startStep-cacheRangeStartX; 2660 2661 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2662 { 2663 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2664 (ResizeWeightingFunctionType) resizeFilterType, 2665 (ResizeWeightingFunctionType) resizeWindowType, 2666 resizeFilterScale, resizeFilterWindowSupport, 2667 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2668 2669 float4 cp = (float4) 0; 2670 2671 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); 2672 cp.x = (float) *(p); 2673 if (number_channels > 2) 2674 { 2675 cp.y = (float) *(p + 1); 2676 cp.z = (float) *(p + 2); 2677 } 2678 2679 if (alpha_index != 0) 2680 { 2681 cp.w = (float) *(p + alpha_index); 2682 2683 float alpha = weight * QuantumScale * cp.w; 2684 2685 filteredPixel.x += alpha * cp.x; 2686 filteredPixel.y += alpha * cp.y; 2687 filteredPixel.z += alpha * cp.z; 2688 filteredPixel.w += weight * cp.w; 2689 gamma += alpha; 2690 } 2691 else 2692 filteredPixel += ((float4) weight)*cp; 2693 2694 density += weight; 2695 } 2696 } 2697 } 2698 2699 // initialize the accumulators to zero 2700 if (itemID < actualNumPixelInThisChunk) { 2701 outputPixelCache[itemID] = (float4)0.0f; 2702 densityCache[itemID] = 0.0f; 2703 if (alpha_index != 0) 2704 gammaCache[itemID] = 0.0f; 2705 } 2706 barrier(CLK_LOCAL_MEM_FENCE); 2707 2708 // accumulatte the filtered pixel value and the density 2709 for (unsigned int i = 0; i < numItems; i++) { 2710 if (pixelIndex != -1) { 2711 if (itemID%numItems == i) { 2712 outputPixelCache[pixelIndex]+=filteredPixel; 2713 densityCache[pixelIndex]+=density; 2714 if (alpha_index != 0) 2715 gammaCache[pixelIndex]+=gamma; 2716 } 2717 } 2718 barrier(CLK_LOCAL_MEM_FENCE); 2719 } 2720 2721 if (itemID < actualNumPixelInThisChunk) 2722 { 2723 float4 filteredPixel = outputPixelCache[itemID]; 2724 2725 float gamma = 0.0f; 2726 if (alpha_index != 0) 2727 gamma = gammaCache[itemID]; 2728 2729 float density = densityCache[itemID]; 2730 if ((density != 0.0f) && (density != 1.0f)) 2731 { 2732 density = PerceptibleReciprocal(density); 2733 filteredPixel *= (float4) density; 2734 gamma *= density; 2735 } 2736 2737 if (alpha_index != 0) 2738 { 2739 gamma = PerceptibleReciprocal(gamma); 2740 filteredPixel.x *= gamma; 2741 filteredPixel.y *= gamma; 2742 filteredPixel.z *= gamma; 2743 } 2744 2745 WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel); 2746 } 2747 } 2748 } 2749 ) 2750 2751 2752 STRINGIFY( 2753 __kernel __attribute__((reqd_work_group_size(1, 256, 1))) 2754 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2755 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2756 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, 2757 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2758 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2759 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2760 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2761 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2762 { 2763 // calculate the range of resized image pixels computed by this workgroup 2764 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; 2765 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); 2766 const unsigned int actualNumPixelToCompute = stopY - startY; 2767 2768 // calculate the range of input image pixels to cache 2769 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); 2770 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2771 scale = PerceptibleReciprocal(scale); 2772 2773 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); 2774 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); 2775 2776 // cache the input pixels into local memory 2777 const unsigned int x = get_global_id(0); 2778 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); 2779 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; 2780 unsigned int stride = inputColumns * number_channels; 2781 for (unsigned int i = 0; i < number_channels; i++) 2782 { 2783 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); 2784 wait_group_events(1,&e); 2785 } 2786 2787 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2788 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2789 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2790 { 2791 const unsigned int chunkStartY = startY + chunk*pixelChunkSize; 2792 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); 2793 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; 2794 2795 // determine which resized pixel computed by this workitem 2796 const unsigned int itemID = get_local_id(1); 2797 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); 2798 2799 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); 2800 2801 float4 filteredPixel = (float4)0.0f; 2802 float density = 0.0f; 2803 float gamma = 0.0f; 2804 // -1 means this workitem doesn't participate in the computation 2805 if (pixelIndex != -1) 2806 { 2807 // x coordinated of the resized pixel computed by this workitem 2808 const int y = chunkStartY + pixelIndex; 2809 2810 // calculate how many steps required for this pixel 2811 const float bisect = (y+0.5)/yFactor+MagickEpsilon; 2812 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2813 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); 2814 const unsigned int n = stop - start; 2815 2816 // calculate how many steps this workitem will contribute 2817 unsigned int numStepsPerWorkItem = n / numItems; 2818 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2819 2820 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2821 if (startStep < n) 2822 { 2823 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2824 2825 unsigned int cacheIndex = start+startStep-cacheRangeStartY; 2826 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2827 { 2828 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2829 (ResizeWeightingFunctionType) resizeFilterType, 2830 (ResizeWeightingFunctionType) resizeWindowType, 2831 resizeFilterScale, resizeFilterWindowSupport, 2832 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2833 2834 float4 cp = (float4)0.0f; 2835 2836 __local CLQuantum *p = inputImageCache + cacheIndex; 2837 cp.x = (float) *(p); 2838 if (number_channels > 2) 2839 { 2840 cp.y = (float) *(p + rangeLength); 2841 cp.z = (float) *(p + (rangeLength * 2)); 2842 } 2843 2844 if (alpha_index != 0) 2845 { 2846 cp.w = (float) *(p + (rangeLength * alpha_index)); 2847 2848 float alpha = weight * QuantumScale * cp.w; 2849 2850 filteredPixel.x += alpha * cp.x; 2851 filteredPixel.y += alpha * cp.y; 2852 filteredPixel.z += alpha * cp.z; 2853 filteredPixel.w += weight * cp.w; 2854 gamma += alpha; 2855 } 2856 else 2857 filteredPixel += ((float4) weight)*cp; 2858 2859 density += weight; 2860 } 2861 } 2862 } 2863 2864 // initialize the accumulators to zero 2865 if (itemID < actualNumPixelInThisChunk) { 2866 outputPixelCache[itemID] = (float4)0.0f; 2867 densityCache[itemID] = 0.0f; 2868 if (alpha_index != 0) 2869 gammaCache[itemID] = 0.0f; 2870 } 2871 barrier(CLK_LOCAL_MEM_FENCE); 2872 2873 // accumulatte the filtered pixel value and the density 2874 for (unsigned int i = 0; i < numItems; i++) { 2875 if (pixelIndex != -1) { 2876 if (itemID%numItems == i) { 2877 outputPixelCache[pixelIndex]+=filteredPixel; 2878 densityCache[pixelIndex]+=density; 2879 if (alpha_index != 0) 2880 gammaCache[pixelIndex]+=gamma; 2881 } 2882 } 2883 barrier(CLK_LOCAL_MEM_FENCE); 2884 } 2885 2886 if (itemID < actualNumPixelInThisChunk) 2887 { 2888 float4 filteredPixel = outputPixelCache[itemID]; 2889 2890 float gamma = 0.0f; 2891 if (alpha_index != 0) 2892 gamma = gammaCache[itemID]; 2893 2894 float density = densityCache[itemID]; 2895 if ((density != 0.0f) && (density != 1.0f)) 2896 { 2897 density = PerceptibleReciprocal(density); 2898 filteredPixel *= (float4) density; 2899 gamma *= density; 2900 } 2901 2902 if (alpha_index != 0) 2903 { 2904 gamma = PerceptibleReciprocal(gamma); 2905 filteredPixel.x *= gamma; 2906 filteredPixel.y *= gamma; 2907 filteredPixel.z *= gamma; 2908 } 2909 2910 WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel); 2911 } 2912 } 2913 } 2914 ) 2915 2916/* 2917%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2918% % 2919% % 2920% % 2921% R o t a t i o n a l B l u r % 2922% % 2923% % 2924% % 2925%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2926*/ 2927 2928 STRINGIFY( 2929 __kernel void RotationalBlur(const __global CLQuantum *image,const unsigned int number_channels, 2930 const unsigned int channel,const float4 bias,const float2 blurCenter,__constant float *cos_theta, 2931 __constant float *sin_theta,const unsigned int cossin_theta_size,__global CLQuantum *filteredImage) 2932 { 2933 const int x = get_global_id(0); 2934 const int y = get_global_id(1); 2935 const int columns = get_global_size(0); 2936 const int rows = get_global_size(1); 2937 unsigned int step = 1; 2938 float center_x = (float) x - blurCenter.x; 2939 float center_y = (float) y - blurCenter.y; 2940 float radius = hypot(center_x, center_y); 2941 2942 float blur_radius = hypot(blurCenter.x, blurCenter.y); 2943 2944 if (radius > MagickEpsilon) 2945 { 2946 step = (unsigned int) (blur_radius / radius); 2947 if (step == 0) 2948 step = 1; 2949 if (step >= cossin_theta_size) 2950 step = cossin_theta_size-1; 2951 } 2952 2953 float4 result = bias; 2954 2955 float normalize = 0.0f; 2956 float gamma = 0.0f; 2957 2958 for (unsigned int i=0; i<cossin_theta_size; i+=step) 2959 { 2960 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); 2961 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); 2962 2963 float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, channel); 2964 2965 if ((number_channels == 4) || (number_channels == 2)) 2966 { 2967 float alpha = (float)(QuantumScale*pixel.w); 2968 2969 gamma += alpha; 2970 2971 result.x += alpha * pixel.x; 2972 result.y += alpha * pixel.y; 2973 result.z += alpha * pixel.z; 2974 result.w += pixel.w; 2975 } 2976 else 2977 result += pixel; 2978 2979 normalize += 1.0f; 2980 } 2981 2982 normalize = PerceptibleReciprocal(normalize); 2983 2984 if ((number_channels == 4) || (number_channels == 2)) 2985 { 2986 gamma = PerceptibleReciprocal(gamma); 2987 result.x *= gamma; 2988 result.y *= gamma; 2989 result.z *= gamma; 2990 result.w *= normalize; 2991 } 2992 else 2993 result *= normalize; 2994 2995 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 2996 } 2997 ) 2998 2999/* 3000%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3001% % 3002% % 3003% % 3004% U n s h a r p M a s k % 3005% % 3006% % 3007% % 3008%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3009*/ 3010 3011 STRINGIFY( 3012 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, 3013 const __global float4 *blurRowData,const unsigned int number_channels, 3014 const ChannelType channel,const unsigned int columns, 3015 const unsigned int rows,__local float4* cachedData, 3016 __local float* cachedFilter,const __global float *filter, 3017 const unsigned int width,const float gain, const float threshold, 3018 __global CLQuantum *filteredImage) 3019 { 3020 const unsigned int radius = (width-1)/2; 3021 3022 // cache the pixel shared by the workgroup 3023 const int groupX = get_group_id(0); 3024 const int groupStartY = get_group_id(1)*get_local_size(1) - radius; 3025 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; 3026 3027 if ((groupStartY >= 0) && (groupStopY < rows)) 3028 { 3029 event_t e = async_work_group_strided_copy(cachedData, 3030 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); 3031 wait_group_events(1,&e); 3032 } 3033 else 3034 { 3035 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) 3036 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; 3037 3038 barrier(CLK_LOCAL_MEM_FENCE); 3039 } 3040 // cache the filter as well 3041 event_t e = async_work_group_copy(cachedFilter,filter,width,0); 3042 wait_group_events(1,&e); 3043 3044 // only do the work if this is not a patched item 3045 const int cy = get_global_id(1); 3046 3047 if (cy < rows) 3048 { 3049 float4 blurredPixel = (float4) 0.0f; 3050 3051 int i = 0; 3052 3053 for ( ; i+7 < width; ) 3054 { 3055 for (int j=0; j < 8; j++, i++) 3056 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; 3057 } 3058 3059 for ( ; i < width; i++) 3060 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; 3061 3062 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); 3063 float4 outputPixel = inputImagePixel - blurredPixel; 3064 3065 float quantumThreshold = QuantumRange*threshold; 3066 3067 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); 3068 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); 3069 3070 //write back 3071 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); 3072 } 3073 } 3074 ) 3075 3076 STRINGIFY( 3077 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, 3078 const ChannelType channel,__constant float *filter,const unsigned int width, 3079 const unsigned int columns,const unsigned int rows,__local float4 *pixels, 3080 const float gain,const float threshold,__global CLQuantum *filteredImage) 3081 { 3082 const unsigned int x = get_global_id(0); 3083 const unsigned int y = get_global_id(1); 3084 3085 const unsigned int radius = (width - 1) / 2; 3086 3087 int row = y - radius; 3088 int baseRow = get_group_id(1) * get_local_size(1) - radius; 3089 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; 3090 3091 while (row < endRow) { 3092 int srcy = (row < 0) ? -row : row; // mirror pad 3093 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; 3094 3095 float4 value = 0.0f; 3096 3097 int ix = x - radius; 3098 int i = 0; 3099 3100 while (i + 7 < width) { 3101 for (int j = 0; j < 8; ++j) { // unrolled 3102 int srcx = ix + j; 3103 srcx = (srcx < 0) ? -srcx : srcx; 3104 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 3105 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 3106 } 3107 ix += 8; 3108 i += 8; 3109 } 3110 3111 while (i < width) { 3112 int srcx = (ix < 0) ? -ix : ix; // mirror pad 3113 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 3114 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 3115 ++i; 3116 ++ix; 3117 } 3118 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; 3119 row += get_local_size(1); 3120 } 3121 3122 barrier(CLK_LOCAL_MEM_FENCE); 3123 3124 const int px = get_local_id(0); 3125 const int py = get_local_id(1); 3126 const int prp = get_local_size(0); 3127 float4 value = (float4)(0.0f); 3128 3129 int i = 0; 3130 while (i + 7 < width) { 3131 for (int j = 0; j < 8; ++j) // unrolled 3132 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; 3133 i += 8; 3134 } 3135 while (i < width) { 3136 value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; 3137 ++i; 3138 } 3139 3140 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); 3141 float4 diff = srcPixel - value; 3142 3143 float quantumThreshold = QuantumRange*threshold; 3144 3145 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); 3146 value = select(srcPixel + diff * gain, srcPixel, mask); 3147 3148 if ((x < columns) && (y < rows)) 3149 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); 3150 } 3151 ) 3152 3153 STRINGIFY( 3154 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) 3155 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, 3156 const unsigned int number_channels,const unsigned int max_channels, 3157 const float threshold,const int passes,const unsigned int imageWidth, 3158 const unsigned int imageHeight) 3159 { 3160 const int pad = (1 << (passes - 1)); 3161 const int tileSize = 64; 3162 const int tileRowPixels = 64; 3163 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; 3164 3165 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) 3166 3167 local float buffer[64 * 64]; 3168 3169 int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); 3170 int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; 3171 3172 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3173 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + 3174 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; 3175 3176 for (int channel = 0; channel < max_channels; ++channel) 3177 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; 3178 } 3179 3180 for (int channel = 0; channel < max_channels; ++channel) { 3181 // Load LDS 3182 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3183 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); 3184 3185 // Process 3186 3187 float tmp[16]; 3188 float accum[16]; 3189 float pixel; 3190 3191 for (int i = 0; i < 16; i++) 3192 accum[i]=0.0f; 3193 3194 for (int pass = 0; pass < passes; ++pass) { 3195 const int radius = 1 << pass; 3196 const int x = get_local_id(0); 3197 const float thresh = threshold * noise[pass]; 3198 3199 // Apply horizontal hat 3200 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3201 const int offset = i * tileRowPixels; 3202 if (pass == 0) 3203 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass 3204 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); 3205 barrier(CLK_LOCAL_MEM_FENCE); 3206 buffer[x + offset] = pixel; 3207 } 3208 barrier(CLK_LOCAL_MEM_FENCE); 3209 3210 // Apply vertical hat 3211 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3212 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); 3213 float delta = tmp[i / 4] - pixel; 3214 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done 3215 if (delta < -thresh) 3216 delta += thresh; 3217 else if (delta > thresh) 3218 delta -= thresh; 3219 else 3220 delta = 0; 3221 accum[i / 4] += delta; 3222 } 3223 barrier(CLK_LOCAL_MEM_FENCE); 3224 3225 if (pass < passes - 1) 3226 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3227 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass 3228 else // last pass 3229 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3230 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output 3231 barrier(CLK_LOCAL_MEM_FENCE); 3232 } 3233 3234 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3235 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); 3236 3237 barrier(CLK_LOCAL_MEM_FENCE); 3238 } 3239 3240 // Write from stage to output 3241 3242 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { 3243 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3244 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { 3245 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); 3246 for (int channel = 0; channel < max_channels; ++channel) { 3247 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; 3248 } 3249 } 3250 } 3251 } 3252 } 3253 ) 3254 3255 ; 3256 3257#endif // MAGICKCORE_OPENCL_SUPPORT 3258 3259#if defined(__cplusplus) || defined(c_plusplus) 3260} 3261#endif 3262 3263#endif // _MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 3264