accelerate-kernels-private.h revision fdf0c636b0dd3c1a32a7667e6a84698713f84a68
1/* 2 Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization 3 dedicated to making software imaging solutions freely available. 4 5 You may not use this file except in compliance with the License. 6 obtain a copy of the License at 7 8 http://www.imagemagick.org/script/license.php 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 16 MagickCore private kernels for accelerated functions. 17*/ 18 19#ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 20#define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 21 22#if defined(__cplusplus) || defined(c_plusplus) 23extern "C" { 24#endif 25 26#if defined(MAGICKCORE_OPENCL_SUPPORT) 27 28/* 29 Define declarations. 30*/ 31#define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" 32#define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" 33#define OPENCL_ELSE() "\n #""else " " \n" 34#define OPENCL_ENDIF() "\n #""endif " " \n" 35#define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" 36#define STRINGIFY(...) #__VA_ARGS__ "\n" 37 38/* 39 Typedef declarations. 40*/ 41 42typedef struct _FloatPixelPacket 43{ 44 MagickRealType 45 red, 46 green, 47 blue, 48 alpha, 49 black; 50} FloatPixelPacket; 51 52const char *accelerateKernels = 53 54/* 55 Define declarations. 56*/ 57 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) 58 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) 59 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) 60 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) 61 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) 62 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) 63 OPENCL_DEFINE(SigmaRandom, (attenuate)) 64 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) 65 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) 66 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) 67 68/* 69 Typedef declarations. 70*/ 71 STRINGIFY( 72 typedef enum 73 { 74 UndefinedColorspace, 75 RGBColorspace, /* Linear RGB colorspace */ 76 GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ 77 TransparentColorspace, 78 OHTAColorspace, 79 LabColorspace, 80 XYZColorspace, 81 YCbCrColorspace, 82 YCCColorspace, 83 YIQColorspace, 84 YPbPrColorspace, 85 YUVColorspace, 86 CMYKColorspace, /* negared linear RGB with black separated */ 87 sRGBColorspace, /* Default: non-lienar sRGB colorspace */ 88 HSBColorspace, 89 HSLColorspace, 90 HWBColorspace, 91 Rec601LumaColorspace, 92 Rec601YCbCrColorspace, 93 Rec709LumaColorspace, 94 Rec709YCbCrColorspace, 95 LogColorspace, 96 CMYColorspace, /* negated linear RGB colorspace */ 97 LuvColorspace, 98 HCLColorspace, 99 LCHColorspace, /* alias for LCHuv */ 100 LMSColorspace, 101 LCHabColorspace, /* Cylindrical (Polar) Lab */ 102 LCHuvColorspace, /* Cylindrical (Polar) Luv */ 103 scRGBColorspace, 104 HSIColorspace, 105 HSVColorspace, /* alias for HSB */ 106 HCLpColorspace, 107 YDbDrColorspace 108 } ColorspaceType; 109 ) 110 111 STRINGIFY( 112 typedef enum 113 { 114 UndefinedCompositeOp, 115 NoCompositeOp, 116 ModulusAddCompositeOp, 117 AtopCompositeOp, 118 BlendCompositeOp, 119 BumpmapCompositeOp, 120 ChangeMaskCompositeOp, 121 ClearCompositeOp, 122 ColorBurnCompositeOp, 123 ColorDodgeCompositeOp, 124 ColorizeCompositeOp, 125 CopyBlackCompositeOp, 126 CopyBlueCompositeOp, 127 CopyCompositeOp, 128 CopyCyanCompositeOp, 129 CopyGreenCompositeOp, 130 CopyMagentaCompositeOp, 131 CopyOpacityCompositeOp, 132 CopyRedCompositeOp, 133 CopyYellowCompositeOp, 134 DarkenCompositeOp, 135 DstAtopCompositeOp, 136 DstCompositeOp, 137 DstInCompositeOp, 138 DstOutCompositeOp, 139 DstOverCompositeOp, 140 DifferenceCompositeOp, 141 DisplaceCompositeOp, 142 DissolveCompositeOp, 143 ExclusionCompositeOp, 144 HardLightCompositeOp, 145 HueCompositeOp, 146 InCompositeOp, 147 LightenCompositeOp, 148 LinearLightCompositeOp, 149 LuminizeCompositeOp, 150 MinusDstCompositeOp, 151 ModulateCompositeOp, 152 MultiplyCompositeOp, 153 OutCompositeOp, 154 OverCompositeOp, 155 OverlayCompositeOp, 156 PlusCompositeOp, 157 ReplaceCompositeOp, 158 SaturateCompositeOp, 159 ScreenCompositeOp, 160 SoftLightCompositeOp, 161 SrcAtopCompositeOp, 162 SrcCompositeOp, 163 SrcInCompositeOp, 164 SrcOutCompositeOp, 165 SrcOverCompositeOp, 166 ModulusSubtractCompositeOp, 167 ThresholdCompositeOp, 168 XorCompositeOp, 169 /* These are new operators, added after the above was last sorted. 170 * The list should be re-sorted only when a new library version is 171 * created. 172 */ 173 DivideDstCompositeOp, 174 DistortCompositeOp, 175 BlurCompositeOp, 176 PegtopLightCompositeOp, 177 VividLightCompositeOp, 178 PinLightCompositeOp, 179 LinearDodgeCompositeOp, 180 LinearBurnCompositeOp, 181 MathematicsCompositeOp, 182 DivideSrcCompositeOp, 183 MinusSrcCompositeOp, 184 DarkenIntensityCompositeOp, 185 LightenIntensityCompositeOp 186 } CompositeOperator; 187 ) 188 189 STRINGIFY( 190 typedef enum 191 { 192 UndefinedFunction, 193 ArcsinFunction, 194 ArctanFunction, 195 PolynomialFunction, 196 SinusoidFunction 197 } MagickFunction; 198 ) 199 200 STRINGIFY( 201 typedef enum 202 { 203 UndefinedNoise, 204 UniformNoise, 205 GaussianNoise, 206 MultiplicativeGaussianNoise, 207 ImpulseNoise, 208 LaplacianNoise, 209 PoissonNoise, 210 RandomNoise 211 } NoiseType; 212 ) 213 214 STRINGIFY( 215 typedef enum 216 { 217 UndefinedPixelIntensityMethod = 0, 218 AveragePixelIntensityMethod, 219 BrightnessPixelIntensityMethod, 220 LightnessPixelIntensityMethod, 221 Rec601LumaPixelIntensityMethod, 222 Rec601LuminancePixelIntensityMethod, 223 Rec709LumaPixelIntensityMethod, 224 Rec709LuminancePixelIntensityMethod, 225 RMSPixelIntensityMethod, 226 MSPixelIntensityMethod 227 } PixelIntensityMethod; 228 ) 229 230 STRINGIFY( 231 typedef enum { 232 BoxWeightingFunction = 0, 233 TriangleWeightingFunction, 234 CubicBCWeightingFunction, 235 HannWeightingFunction, 236 HammingWeightingFunction, 237 BlackmanWeightingFunction, 238 GaussianWeightingFunction, 239 QuadraticWeightingFunction, 240 JincWeightingFunction, 241 SincWeightingFunction, 242 SincFastWeightingFunction, 243 KaiserWeightingFunction, 244 WelshWeightingFunction, 245 BohmanWeightingFunction, 246 LagrangeWeightingFunction, 247 CosineWeightingFunction, 248 } ResizeWeightingFunctionType; 249 ) 250 251 STRINGIFY( 252 typedef enum 253 { 254 UndefinedChannel = 0x0000, 255 RedChannel = 0x0001, 256 GrayChannel = 0x0001, 257 CyanChannel = 0x0001, 258 GreenChannel = 0x0002, 259 MagentaChannel = 0x0002, 260 BlueChannel = 0x0004, 261 YellowChannel = 0x0004, 262 BlackChannel = 0x0008, 263 AlphaChannel = 0x0010, 264 OpacityChannel = 0x0010, 265 IndexChannel = 0x0020, /* Color Index Table? */ 266 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ 267 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ 268 MetaChannel = 0x0100, /* ???? */ 269 CompositeChannels = 0x002F, 270 AllChannels = 0x7ffffff, 271 /* 272 Special purpose channel types. 273 FUTURE: are these needed any more - they are more like hacks 274 SyncChannels for example is NOT a real channel but a 'flag' 275 It really says -- "User has not defined channels" 276 Though it does have extra meaning in the "-auto-level" operator 277 */ 278 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ 279 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ 280 GrayChannels = 0x0400, 281 SyncChannels = 0x20000, /* channels modified as a single unit */ 282 DefaultChannels = AllChannels 283 } ChannelType; /* must correspond to PixelChannel */ 284 ) 285 286/* 287 Helper functions. 288*/ 289 290OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) 291 292 STRINGIFY( 293 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 294 { 295 return((CLQuantum) value); 296 } 297 ) 298 299OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) 300 301 STRINGIFY( 302 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 303 { 304 return((CLQuantum) (257.0f*value)); 305 } 306 ) 307 308OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) 309 310 STRINGIFY( 311 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 312 { 313 return((CLQuantum) (16843009.0*value)); 314 } 315 ) 316 317OPENCL_ENDIF() 318 319 STRINGIFY( 320 inline int ClampToCanvas(const int offset,const int range) 321 { 322 return clamp(offset, (int)0, range-1); 323 } 324 ) 325 326 STRINGIFY( 327 inline CLQuantum ClampToQuantum(const float value) 328 { 329 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); 330 } 331 ) 332 333 STRINGIFY( 334 inline uint ScaleQuantumToMap(CLQuantum value) 335 { 336 if (value >= (CLQuantum) MaxMap) 337 return ((uint)MaxMap); 338 else 339 return ((uint)value); 340 } 341 ) 342 343 STRINGIFY( 344 inline float PerceptibleReciprocal(const float x) 345 { 346 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; 347 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); 348 } 349 ) 350 351 STRINGIFY( 352 inline float RoundToUnity(const float value) 353 { 354 return clamp(value,0.0f,1.0f); 355 } 356 ) 357 358 STRINGIFY( 359 360 inline unsigned int getPixelIndex(const unsigned int number_channels, 361 const unsigned int columns, const unsigned int x, const unsigned int y) 362 { 363 return (x * number_channels) + (y * columns * number_channels); 364 } 365 366 inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } 367 inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } 368 inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } 369 inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } 370 371 inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } 372 inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } 373 inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } 374 inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } 375 376 inline CLQuantum getBlue(CLPixelType p) { return p.x; } 377 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } 378 inline float getBlueF4(float4 p) { return p.x; } 379 inline void setBlueF4(float4* p, float value) { (*p).x = value; } 380 381 inline CLQuantum getGreen(CLPixelType p) { return p.y; } 382 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } 383 inline float getGreenF4(float4 p) { return p.y; } 384 inline void setGreenF4(float4* p, float value) { (*p).y = value; } 385 386 inline CLQuantum getRed(CLPixelType p) { return p.z; } 387 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } 388 inline float getRedF4(float4 p) { return p.z; } 389 inline void setRedF4(float4* p, float value) { (*p).z = value; } 390 391 inline CLQuantum getAlpha(CLPixelType p) { return p.w; } 392 inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } 393 inline float getAlphaF4(float4 p) { return p.w; } 394 inline void setAlphaF4(float4* p, float value) { (*p).w = value; } 395 396 inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, 397 const ChannelType channel, float *red, float *green, float *blue, float *alpha) 398 { 399 if ((channel & RedChannel) != 0) 400 *red=getPixelRed(p); 401 402 if (number_channels > 2) 403 { 404 if ((channel & GreenChannel) != 0) 405 *green=getPixelGreen(p); 406 407 if ((channel & BlueChannel) != 0) 408 *blue=getPixelBlue(p); 409 } 410 411 if (((number_channels == 4) || (number_channels == 2)) && 412 ((channel & AlphaChannel) != 0)) 413 *alpha=getPixelAlpha(p,number_channels); 414 } 415 416 inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, 417 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) 418 { 419 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 420 421 float red = 0.0f; 422 float green = 0.0f; 423 float blue = 0.0f; 424 float alpha = 0.0f; 425 426 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 427 return (float4)(red, green, blue, alpha); 428 } 429 430 inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, 431 const ChannelType channel, float red, float green, float blue, float alpha) 432 { 433 if ((channel & RedChannel) != 0) 434 setPixelRed(p,ClampToQuantum(red)); 435 436 if (number_channels > 2) 437 { 438 if ((channel & GreenChannel) != 0) 439 setPixelGreen(p,ClampToQuantum(green)); 440 441 if ((channel & BlueChannel) != 0) 442 setPixelBlue(p,ClampToQuantum(blue)); 443 } 444 445 if (((number_channels == 4) || (number_channels == 2)) && 446 ((channel & AlphaChannel) != 0)) 447 setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); 448 } 449 450 inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, 451 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, 452 float4 pixel) 453 { 454 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 455 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); 456 } 457 458 inline float GetPixelIntensity(const unsigned int colorspace, 459 const unsigned int method,float red,float green,float blue) 460 { 461 float intensity; 462 463 if (colorspace == GRAYColorspace) 464 return red; 465 466 switch (method) 467 { 468 case AveragePixelIntensityMethod: 469 { 470 intensity=(red+green+blue)/3.0; 471 break; 472 } 473 case BrightnessPixelIntensityMethod: 474 { 475 intensity=MagickMax(MagickMax(red,green),blue); 476 break; 477 } 478 case LightnessPixelIntensityMethod: 479 { 480 intensity=(MagickMin(MagickMin(red,green),blue)+ 481 MagickMax(MagickMax(red,green),blue))/2.0; 482 break; 483 } 484 case MSPixelIntensityMethod: 485 { 486 intensity=(float) (((float) red*red+green*green+blue*blue)/ 487 (3.0*QuantumRange)); 488 break; 489 } 490 case Rec601LumaPixelIntensityMethod: 491 { 492 /* 493 if (image->colorspace == RGBColorspace) 494 { 495 red=EncodePixelGamma(red); 496 green=EncodePixelGamma(green); 497 blue=EncodePixelGamma(blue); 498 } 499 */ 500 intensity=0.298839*red+0.586811*green+0.114350*blue; 501 break; 502 } 503 case Rec601LuminancePixelIntensityMethod: 504 { 505 /* 506 if (image->colorspace == sRGBColorspace) 507 { 508 red=DecodePixelGamma(red); 509 green=DecodePixelGamma(green); 510 blue=DecodePixelGamma(blue); 511 } 512 */ 513 intensity=0.298839*red+0.586811*green+0.114350*blue; 514 break; 515 } 516 case Rec709LumaPixelIntensityMethod: 517 default: 518 { 519 /* 520 if (image->colorspace == RGBColorspace) 521 { 522 red=EncodePixelGamma(red); 523 green=EncodePixelGamma(green); 524 blue=EncodePixelGamma(blue); 525 } 526 */ 527 intensity=0.212656*red+0.715158*green+0.072186*blue; 528 break; 529 } 530 case Rec709LuminancePixelIntensityMethod: 531 { 532 /* 533 if (image->colorspace == sRGBColorspace) 534 { 535 red=DecodePixelGamma(red); 536 green=DecodePixelGamma(green); 537 blue=DecodePixelGamma(blue); 538 } 539 */ 540 intensity=0.212656*red+0.715158*green+0.072186*blue; 541 break; 542 } 543 case RMSPixelIntensityMethod: 544 { 545 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ 546 sqrt(3.0)); 547 break; 548 } 549 } 550 551 return intensity; 552 } 553 554 inline int mirrorBottom(int value) 555 { 556 return (value < 0) ? - (value) : value; 557 } 558 559 inline int mirrorTop(int value, int width) 560 { 561 return (value >= width) ? (2 * width - value - 1) : value; 562 } 563 ) 564 565/* 566%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 567% % 568% % 569% % 570% A d d N o i s e % 571% % 572% % 573% % 574%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 575*/ 576 577 STRINGIFY( 578 /* 579 Part of MWC64X by David Thomas, dt10@imperial.ac.uk 580 This is provided under BSD, full license is with the main package. 581 See http://www.doc.ic.ac.uk/~dt10/research 582 */ 583 584 // Pre: a<M, b<M 585 // Post: r=(a+b) mod M 586 ulong MWC_AddMod64(ulong a, ulong b, ulong M) 587 { 588 ulong v=a+b; 589 //if( (v>=M) || (v<a) ) 590 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. 591 v=v-M; 592 return v; 593 } 594 595 // Pre: a<M,b<M 596 // Post: r=(a*b) mod M 597 // This could be done more efficently, but it is portable, and should 598 // be easy to understand. It can be replaced with any of the better 599 // modular multiplication algorithms (for example if you know you have 600 // double precision available or something). 601 ulong MWC_MulMod64(ulong a, ulong b, ulong M) 602 { 603 ulong r=0; 604 while(a!=0){ 605 if(a&1) 606 r=MWC_AddMod64(r,b,M); 607 b=MWC_AddMod64(b,b,M); 608 a=a>>1; 609 } 610 return r; 611 } 612 613 // Pre: a<M, e>=0 614 // Post: r=(a^b) mod M 615 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on 616 // most architectures 617 ulong MWC_PowMod64(ulong a, ulong e, ulong M) 618 { 619 ulong sqr=a, acc=1; 620 while(e!=0){ 621 if(e&1) 622 acc=MWC_MulMod64(acc,sqr,M); 623 sqr=MWC_MulMod64(sqr,sqr,M); 624 e=e>>1; 625 } 626 return acc; 627 } 628 629 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) 630 { 631 ulong m=MWC_PowMod64(A, distance, M); 632 ulong x=curr.x*(ulong)A+curr.y; 633 x=MWC_MulMod64(x, m, M); 634 return (uint2)((uint)(x/A), (uint)(x%A)); 635 } 636 637 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) 638 { 639 // This is an arbitrary constant for starting LCG jumping from. I didn't 640 // want to start from 1, as then you end up with the two or three first values 641 // being a bit poor in ones - once you've decided that, one constant is as 642 // good as any another. There is no deep mathematical reason for it, I just 643 // generated a random number. 644 enum{ MWC_BASEID = 4077358422479273989UL }; 645 646 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; 647 ulong m=MWC_PowMod64(A, dist, M); 648 649 ulong x=MWC_MulMod64(MWC_BASEID, m, M); 650 return (uint2)((uint)(x/A), (uint)(x%A)); 651 } 652 653 //! Represents the state of a particular generator 654 typedef struct{ uint x; uint c; } mwc64x_state_t; 655 656 enum{ MWC64X_A = 4294883355U }; 657 enum{ MWC64X_M = 18446383549859758079UL }; 658 659 void MWC64X_Step(mwc64x_state_t *s) 660 { 661 uint X=s->x, C=s->c; 662 663 uint Xn=MWC64X_A*X+C; 664 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar 665 uint Cn=mad_hi(MWC64X_A,X,carry); 666 667 s->x=Xn; 668 s->c=Cn; 669 } 670 671 void MWC64X_Skip(mwc64x_state_t *s, ulong distance) 672 { 673 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); 674 s->x=tmp.x; 675 s->c=tmp.y; 676 } 677 678 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) 679 { 680 uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); 681 s->x=tmp.x; 682 s->c=tmp.y; 683 } 684 685 //! Return a 32-bit integer in the range [0..2^32) 686 uint MWC64X_NextUint(mwc64x_state_t *s) 687 { 688 uint res=s->x ^ s->c; 689 MWC64X_Step(s); 690 return res; 691 } 692 693 // 694 // End of MWC64X excerpt 695 // 696 697 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) 698 { 699 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 700 } 701 702 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) 703 { 704 float 705 alpha, 706 beta, 707 noise, 708 sigma; 709 710 noise = 0.0f; 711 alpha=mwcReadPseudoRandomValue(r); 712 switch(noise_type) 713 { 714 case UniformNoise: 715 default: 716 { 717 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); 718 break; 719 } 720 case GaussianNoise: 721 { 722 float 723 gamma, 724 tau; 725 726 if (alpha == 0.0f) 727 alpha=1.0f; 728 beta=mwcReadPseudoRandomValue(r); 729 gamma=sqrt(-2.0f*log(alpha)); 730 sigma=gamma*cospi((2.0f*beta)); 731 tau=gamma*sinpi((2.0f*beta)); 732 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; 733 break; 734 } 735 case ImpulseNoise: 736 { 737 if (alpha < (SigmaImpulse/2.0f)) 738 noise=0.0f; 739 else 740 if (alpha >= (1.0f-(SigmaImpulse/2.0f))) 741 noise=QuantumRange; 742 else 743 noise=pixel; 744 break; 745 } 746 case LaplacianNoise: 747 { 748 if (alpha <= 0.5f) 749 { 750 if (alpha <= MagickEpsilon) 751 noise=(pixel-QuantumRange); 752 else 753 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ 754 0.5f); 755 break; 756 } 757 beta=1.0f-alpha; 758 if (beta <= (0.5f*MagickEpsilon)) 759 noise=(pixel+QuantumRange); 760 else 761 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); 762 break; 763 } 764 case MultiplicativeGaussianNoise: 765 { 766 sigma=1.0f; 767 if (alpha > MagickEpsilon) 768 sigma=sqrt(-2.0f*log(alpha)); 769 beta=mwcReadPseudoRandomValue(r); 770 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* 771 cospi((2.0f*beta))/2.0f); 772 break; 773 } 774 case PoissonNoise: 775 { 776 float 777 poisson; 778 unsigned int i; 779 poisson=exp(-SigmaPoisson*QuantumScale*pixel); 780 for (i=0; alpha > poisson; i++) 781 { 782 beta=mwcReadPseudoRandomValue(r); 783 alpha*=beta; 784 } 785 noise=(QuantumRange*i/SigmaPoisson); 786 break; 787 } 788 case RandomNoise: 789 { 790 noise=(QuantumRange*SigmaRandom*alpha); 791 break; 792 } 793 } 794 return noise; 795 } 796 797 __kernel 798 void AddNoise(const __global CLQuantum *image, 799 const unsigned int number_channels,const ChannelType channel, 800 const unsigned int length,const unsigned int pixelsPerWorkItem, 801 const NoiseType noise_type,const float attenuate,const unsigned int seed0, 802 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, 803 __global CLQuantum *filteredImage) 804 { 805 mwc64x_state_t rng; 806 rng.x = seed0; 807 rng.c = seed1; 808 809 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use 810 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); 811 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams 812 813 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); 814 uint count = pixelsPerWorkItem; 815 816 while (count > 0 && pos < length) 817 { 818 const __global CLQuantum *p = image + pos; 819 __global CLQuantum *q = filteredImage + pos; 820 821 float red; 822 float green; 823 float blue; 824 float alpha; 825 826 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 827 828 if ((channel & RedChannel) != 0) 829 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); 830 831 if (number_channels > 2) 832 { 833 if ((channel & GreenChannel) != 0) 834 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); 835 836 if ((channel & BlueChannel) != 0) 837 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); 838 } 839 840 if (((number_channels == 4) || (number_channels == 2)) && 841 ((channel & AlphaChannel) != 0)) 842 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); 843 844 WriteChannels(q, number_channels, channel, red, green, blue, alpha); 845 846 pos += (get_local_size(0) * number_channels); 847 count--; 848 } 849 } 850 ) 851 852/* 853%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 854% % 855% % 856% % 857% B l u r % 858% % 859% % 860% % 861%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 862*/ 863 864 STRINGIFY( 865 /* 866 Reduce image noise and reduce detail levels by row 867 */ 868 __kernel void BlurRow(const __global CLQuantum *image, 869 const unsigned int number_channels,const ChannelType channel, 870 __constant float *filter,const unsigned int width, 871 const unsigned int imageColumns,const unsigned int imageRows, 872 __local float4 *temp,__global float4 *tempImage) 873 { 874 const int x = get_global_id(0); 875 const int y = get_global_id(1); 876 877 const int columns = imageColumns; 878 879 const unsigned int radius = (width-1)/2; 880 const int wsize = get_local_size(0); 881 const unsigned int loadSize = wsize+width; 882 883 //group coordinate 884 const int groupX=get_local_size(0)*get_group_id(0); 885 886 //parallel load and clamp 887 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) 888 { 889 int cx = ClampToCanvas(i + groupX - radius, columns); 890 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); 891 } 892 893 // barrier 894 barrier(CLK_LOCAL_MEM_FENCE); 895 896 // only do the work if this is not a patched item 897 if (get_global_id(0) < columns) 898 { 899 // compute 900 float4 result = (float4) 0; 901 902 int i = 0; 903 904 for ( ; i+7 < width; ) 905 { 906 for (int j=0; j < 8; j++) 907 result+=filter[i+j]*temp[i+j+get_local_id(0)]; 908 i+=8; 909 } 910 911 for ( ; i < width; i++) 912 result+=filter[i]*temp[i+get_local_id(0)]; 913 914 // write back to global 915 tempImage[y*columns+x] = result; 916 } 917 } 918 ) 919 920 STRINGIFY( 921 /* 922 Reduce image noise and reduce detail levels by line 923 */ 924 __kernel void BlurColumn(const __global float4 *blurRowData, 925 const unsigned int number_channels,const ChannelType channel, 926 __constant float *filter,const unsigned int width, 927 const unsigned int imageColumns,const unsigned int imageRows, 928 __local float4 *temp,__global CLQuantum *filteredImage) 929 { 930 const int x = get_global_id(0); 931 const int y = get_global_id(1); 932 933 const int columns = imageColumns; 934 const int rows = imageRows; 935 936 unsigned int radius = (width-1)/2; 937 const int wsize = get_local_size(1); 938 const unsigned int loadSize = wsize+width; 939 940 //group coordinate 941 const int groupX=get_local_size(0)*get_group_id(0); 942 const int groupY=get_local_size(1)*get_group_id(1); 943 //notice that get_local_size(0) is 1, so 944 //groupX=get_group_id(0); 945 946 //parallel load and clamp 947 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) 948 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; 949 950 // barrier 951 barrier(CLK_LOCAL_MEM_FENCE); 952 953 // only do the work if this is not a patched item 954 if (get_global_id(1) < rows) 955 { 956 // compute 957 float4 result = (float4) 0; 958 959 int i = 0; 960 961 for ( ; i+7 < width; ) 962 { 963 for (int j=0; j < 8; j++) 964 result+=filter[i+j]*temp[i+j+get_local_id(1)]; 965 i+=8; 966 } 967 968 for ( ; i < width; i++) 969 result+=filter[i]*temp[i+get_local_id(1)]; 970 971 // write back to global 972 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 973 } 974 } 975 ) 976 977/* 978%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 979% % 980% % 981% % 982% C o n t r a s t % 983% % 984% % 985% % 986%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 987*/ 988 989 STRINGIFY( 990 991 inline float3 ConvertRGBToHSB(CLPixelType pixel) { 992 float3 HueSaturationBrightness; 993 HueSaturationBrightness.x = 0.0f; // Hue 994 HueSaturationBrightness.y = 0.0f; // Saturation 995 HueSaturationBrightness.z = 0.0f; // Brightness 996 997 float r=(float) getRed(pixel); 998 float g=(float) getGreen(pixel); 999 float b=(float) getBlue(pixel); 1000 1001 float tmin=MagickMin(MagickMin(r,g),b); 1002 float tmax=MagickMax(MagickMax(r,g),b); 1003 1004 if (tmax!=0.0f) { 1005 float delta=tmax-tmin; 1006 HueSaturationBrightness.y=delta/tmax; 1007 HueSaturationBrightness.z=QuantumScale*tmax; 1008 1009 if (delta != 0.0f) { 1010 HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); 1011 HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta; 1012 HueSaturationBrightness.x/=6.0f; 1013 HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; 1014 } 1015 } 1016 return HueSaturationBrightness; 1017 } 1018 1019 inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { 1020 1021 float hue = HueSaturationBrightness.x; 1022 float brightness = HueSaturationBrightness.z; 1023 float saturation = HueSaturationBrightness.y; 1024 1025 CLPixelType rgb; 1026 1027 if (saturation == 0.0f) { 1028 setRed(&rgb,ClampToQuantum(QuantumRange*brightness)); 1029 setGreen(&rgb,getRed(rgb)); 1030 setBlue(&rgb,getRed(rgb)); 1031 } 1032 else { 1033 1034 float h=6.0f*(hue-floor(hue)); 1035 float f=h-floor(h); 1036 float p=brightness*(1.0f-saturation); 1037 float q=brightness*(1.0f-saturation*f); 1038 float t=brightness*(1.0f-(saturation*(1.0f-f))); 1039 1040 float clampedBrightness = ClampToQuantum(QuantumRange*brightness); 1041 float clamped_t = ClampToQuantum(QuantumRange*t); 1042 float clamped_p = ClampToQuantum(QuantumRange*p); 1043 float clamped_q = ClampToQuantum(QuantumRange*q); 1044 int ih = (int)h; 1045 setRed(&rgb, (ih == 1)?clamped_q: 1046 (ih == 2 || ih == 3)?clamped_p: 1047 (ih == 4)?clamped_t: 1048 clampedBrightness); 1049 1050 setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness: 1051 (ih == 3)?clamped_q: 1052 (ih == 4 || ih == 5)?clamped_p: 1053 clamped_t); 1054 1055 setBlue(&rgb, (ih == 2)?clamped_t: 1056 (ih == 3 || ih == 4)?clampedBrightness: 1057 (ih == 5)?clamped_q: 1058 clamped_p); 1059 } 1060 return rgb; 1061 } 1062 1063 __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) 1064 { 1065 1066 const int sign = sharpen!=0?1:-1; 1067 const int x = get_global_id(0); 1068 const int y = get_global_id(1); 1069 const int columns = get_global_size(0); 1070 const int c = x + y * columns; 1071 1072 CLPixelType pixel = im[c]; 1073 float3 HueSaturationBrightness = ConvertRGBToHSB(pixel); 1074 float brightness = HueSaturationBrightness.z; 1075 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); 1076 brightness = clamp(brightness,0.0f,1.0f); 1077 HueSaturationBrightness.z = brightness; 1078 1079 CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness); 1080 filteredPixel.w = pixel.w; 1081 im[c] = filteredPixel; 1082 } 1083 ) 1084 1085/* 1086%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1087% % 1088% % 1089% % 1090% C o n t r a s t S t r e t c h % 1091% % 1092% % 1093% % 1094%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1095*/ 1096 1097 STRINGIFY( 1098 /* 1099 */ 1100 __kernel void Histogram(__global CLPixelType * restrict im, 1101 const ChannelType channel, 1102 const unsigned int colorspace, 1103 const unsigned int method, 1104 __global uint4 * restrict histogram) 1105 { 1106 const int x = get_global_id(0); 1107 const int y = get_global_id(1); 1108 const int columns = get_global_size(0); 1109 const int c = x + y * columns; 1110 if ((channel & SyncChannels) != 0) 1111 { 1112 float red=(float)getRed(im[c]); 1113 float green=(float)getGreen(im[c]); 1114 float blue=(float)getBlue(im[c]); 1115 1116 float intensity = GetPixelIntensity(colorspace, method, red, green, blue); 1117 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); 1118 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position 1119 } 1120 else 1121 { 1122 // for equalizing, we always need all channels? 1123 // otherwise something more 1124 } 1125 } 1126 ) 1127 1128 STRINGIFY( 1129 /* 1130 */ 1131 __kernel void ContrastStretch(__global CLPixelType * restrict im, 1132 const ChannelType channel, 1133 __global CLPixelType * restrict stretch_map, 1134 const float4 white, const float4 black) 1135 { 1136 const int x = get_global_id(0); 1137 const int y = get_global_id(1); 1138 const int columns = get_global_size(0); 1139 const int c = x + y * columns; 1140 1141 uint ePos; 1142 CLPixelType oValue, eValue; 1143 CLQuantum red, green, blue, alpha; 1144 1145 //read from global 1146 oValue=im[c]; 1147 1148 if ((channel & RedChannel) != 0) 1149 { 1150 if (getRedF4(white) != getRedF4(black)) 1151 { 1152 ePos = ScaleQuantumToMap(getRed(oValue)); 1153 eValue = stretch_map[ePos]; 1154 red = getRed(eValue); 1155 } 1156 } 1157 1158 if ((channel & GreenChannel) != 0) 1159 { 1160 if (getGreenF4(white) != getGreenF4(black)) 1161 { 1162 ePos = ScaleQuantumToMap(getGreen(oValue)); 1163 eValue = stretch_map[ePos]; 1164 green = getGreen(eValue); 1165 } 1166 } 1167 1168 if ((channel & BlueChannel) != 0) 1169 { 1170 if (getBlueF4(white) != getBlueF4(black)) 1171 { 1172 ePos = ScaleQuantumToMap(getBlue(oValue)); 1173 eValue = stretch_map[ePos]; 1174 blue = getBlue(eValue); 1175 } 1176 } 1177 1178 if ((channel & AlphaChannel) != 0) 1179 { 1180 if (getAlphaF4(white) != getAlphaF4(black)) 1181 { 1182 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1183 eValue = stretch_map[ePos]; 1184 alpha = getAlpha(eValue); 1185 } 1186 } 1187 1188 //write back 1189 im[c]=(CLPixelType)(blue, green, red, alpha); 1190 1191 } 1192 ) 1193 1194/* 1195%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1196% % 1197% % 1198% % 1199% C o n v o l v e % 1200% % 1201% % 1202% % 1203%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1204*/ 1205 1206 STRINGIFY( 1207 __kernel 1208 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, 1209 const unsigned int imageWidth, const unsigned int imageHeight, 1210 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1211 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { 1212 1213 int2 blockID; 1214 blockID.x = get_global_id(0) / get_local_size(0); 1215 blockID.y = get_global_id(1) / get_local_size(1); 1216 1217 // image area processed by this workgroup 1218 int2 imageAreaOrg; 1219 imageAreaOrg.x = blockID.x * get_local_size(0); 1220 imageAreaOrg.y = blockID.y * get_local_size(1); 1221 1222 int2 midFilterDimen; 1223 midFilterDimen.x = (filterWidth-1)/2; 1224 midFilterDimen.y = (filterHeight-1)/2; 1225 1226 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; 1227 1228 // dimension of the local cache 1229 int2 cachedAreaDimen; 1230 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; 1231 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; 1232 1233 // cache the pixels accessed by this workgroup in local memory 1234 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); 1235 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; 1236 int groupSize = get_local_size(0) * get_local_size(1); 1237 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { 1238 1239 int2 cachedAreaIndex; 1240 cachedAreaIndex.x = i % cachedAreaDimen.x; 1241 cachedAreaIndex.y = i / cachedAreaDimen.x; 1242 1243 int2 imagePixelIndex; 1244 imagePixelIndex = cachedAreaOrg + cachedAreaIndex; 1245 1246 // only support EdgeVirtualPixelMethod through ClampToCanvas 1247 // TODO: implement other virtual pixel method 1248 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); 1249 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); 1250 1251 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; 1252 } 1253 1254 // cache the filter 1255 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { 1256 filterCache[i] = filter[i]; 1257 } 1258 barrier(CLK_LOCAL_MEM_FENCE); 1259 1260 1261 int2 imageIndex; 1262 imageIndex.x = imageAreaOrg.x + get_local_id(0); 1263 imageIndex.y = imageAreaOrg.y + get_local_id(1); 1264 1265 // if out-of-range, stops here and quit 1266 if (imageIndex.x >= imageWidth 1267 || imageIndex.y >= imageHeight) { 1268 return; 1269 } 1270 1271 int filterIndex = 0; 1272 float4 sum = (float4)0.0f; 1273 float gamma = 0.0f; 1274 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1275 int cacheIndexY = get_local_id(1); 1276 for (int j = 0; j < filterHeight; j++) { 1277 int cacheIndexX = get_local_id(0); 1278 for (int i = 0; i < filterWidth; i++) { 1279 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1280 float f = filterCache[filterIndex]; 1281 1282 sum.x += f * p.x; 1283 sum.y += f * p.y; 1284 sum.z += f * p.z; 1285 sum.w += f * p.w; 1286 1287 gamma += f; 1288 filterIndex++; 1289 cacheIndexX++; 1290 } 1291 cacheIndexY++; 1292 } 1293 } 1294 else { 1295 int cacheIndexY = get_local_id(1); 1296 for (int j = 0; j < filterHeight; j++) { 1297 int cacheIndexX = get_local_id(0); 1298 for (int i = 0; i < filterWidth; i++) { 1299 1300 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1301 float alpha = QuantumScale*p.w; 1302 float f = filterCache[filterIndex]; 1303 float g = alpha * f; 1304 1305 sum.x += g*p.x; 1306 sum.y += g*p.y; 1307 sum.z += g*p.z; 1308 sum.w += f*p.w; 1309 1310 gamma += g; 1311 filterIndex++; 1312 cacheIndexX++; 1313 } 1314 cacheIndexY++; 1315 } 1316 gamma = PerceptibleReciprocal(gamma); 1317 sum.xyz = gamma*sum.xyz; 1318 } 1319 CLPixelType outputPixel; 1320 outputPixel.x = ClampToQuantum(sum.x); 1321 outputPixel.y = ClampToQuantum(sum.y); 1322 outputPixel.z = ClampToQuantum(sum.z); 1323 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1324 1325 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1326 } 1327 ) 1328 1329 STRINGIFY( 1330 __kernel 1331 void Convolve(const __global CLPixelType *input, __global CLPixelType *output, 1332 const uint imageWidth, const uint imageHeight, 1333 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1334 const uint matte, const ChannelType channel) { 1335 1336 int2 imageIndex; 1337 imageIndex.x = get_global_id(0); 1338 imageIndex.y = get_global_id(1); 1339 1340 /* 1341 unsigned int imageWidth = get_global_size(0); 1342 unsigned int imageHeight = get_global_size(1); 1343 */ 1344 if (imageIndex.x >= imageWidth 1345 || imageIndex.y >= imageHeight) 1346 return; 1347 1348 int2 midFilterDimen; 1349 midFilterDimen.x = (filterWidth-1)/2; 1350 midFilterDimen.y = (filterHeight-1)/2; 1351 1352 int filterIndex = 0; 1353 float4 sum = (float4)0.0f; 1354 float gamma = 0.0f; 1355 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1356 for (int j = 0; j < filterHeight; j++) { 1357 int2 inputPixelIndex; 1358 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1359 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1360 for (int i = 0; i < filterWidth; i++) { 1361 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1362 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1363 1364 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1365 float f = filter[filterIndex]; 1366 1367 sum.x += f * p.x; 1368 sum.y += f * p.y; 1369 sum.z += f * p.z; 1370 sum.w += f * p.w; 1371 1372 gamma += f; 1373 1374 filterIndex++; 1375 } 1376 } 1377 } 1378 else { 1379 1380 for (int j = 0; j < filterHeight; j++) { 1381 int2 inputPixelIndex; 1382 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1383 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1384 for (int i = 0; i < filterWidth; i++) { 1385 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1386 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1387 1388 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1389 float alpha = QuantumScale*p.w; 1390 float f = filter[filterIndex]; 1391 float g = alpha * f; 1392 1393 sum.x += g*p.x; 1394 sum.y += g*p.y; 1395 sum.z += g*p.z; 1396 sum.w += f*p.w; 1397 1398 gamma += g; 1399 1400 1401 filterIndex++; 1402 } 1403 } 1404 gamma = PerceptibleReciprocal(gamma); 1405 sum.xyz = gamma*sum.xyz; 1406 } 1407 1408 CLPixelType outputPixel; 1409 outputPixel.x = ClampToQuantum(sum.x); 1410 outputPixel.y = ClampToQuantum(sum.y); 1411 outputPixel.z = ClampToQuantum(sum.z); 1412 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1413 1414 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1415 } 1416 ) 1417 1418/* 1419%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1420% % 1421% % 1422% % 1423% D e s p e c k l e % 1424% % 1425% % 1426% % 1427%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1428*/ 1429 1430 STRINGIFY( 1431 1432 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1433 , const unsigned int imageWidth, const unsigned int imageHeight 1434 , const int2 offset, const int polarity, const int matte) { 1435 1436 int x = get_global_id(0); 1437 int y = get_global_id(1); 1438 1439 CLPixelType v = inputImage[y*imageWidth+x]; 1440 1441 int2 neighbor; 1442 neighbor.y = y + offset.y; 1443 neighbor.x = x + offset.x; 1444 1445 int2 clampedNeighbor; 1446 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1447 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1448 1449 CLPixelType r = (clampedNeighbor.x == neighbor.x 1450 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1451 :(CLPixelType)0; 1452 1453 int sv[4]; 1454 sv[0] = (int)v.x; 1455 sv[1] = (int)v.y; 1456 sv[2] = (int)v.z; 1457 sv[3] = (int)v.w; 1458 1459 int sr[4]; 1460 sr[0] = (int)r.x; 1461 sr[1] = (int)r.y; 1462 sr[2] = (int)r.z; 1463 sr[3] = (int)r.w; 1464 1465 if (polarity > 0) { 1466 \n #pragma unroll 4\n 1467 for (unsigned int i = 0; i < 4; i++) { 1468 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; 1469 } 1470 } 1471 else { 1472 \n #pragma unroll 4\n 1473 for (unsigned int i = 0; i < 4; i++) { 1474 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; 1475 } 1476 1477 } 1478 1479 v.x = (CLQuantum)sv[0]; 1480 v.y = (CLQuantum)sv[1]; 1481 v.z = (CLQuantum)sv[2]; 1482 1483 if (matte!=0) 1484 v.w = (CLQuantum)sv[3]; 1485 1486 outputImage[y*imageWidth+x] = v; 1487 1488 } 1489 1490 1491 ) 1492 1493 STRINGIFY( 1494 1495 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1496 , const unsigned int imageWidth, const unsigned int imageHeight 1497 , const int2 offset, const int polarity, const int matte) { 1498 1499 int x = get_global_id(0); 1500 int y = get_global_id(1); 1501 1502 CLPixelType v = inputImage[y*imageWidth+x]; 1503 1504 int2 neighbor, clampedNeighbor; 1505 1506 neighbor.y = y + offset.y; 1507 neighbor.x = x + offset.x; 1508 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1509 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1510 1511 CLPixelType r = (clampedNeighbor.x == neighbor.x 1512 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1513 :(CLPixelType)0; 1514 1515 1516 neighbor.y = y - offset.y; 1517 neighbor.x = x - offset.x; 1518 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1519 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1520 1521 CLPixelType s = (clampedNeighbor.x == neighbor.x 1522 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1523 :(CLPixelType)0; 1524 1525 1526 int sv[4]; 1527 sv[0] = (int)v.x; 1528 sv[1] = (int)v.y; 1529 sv[2] = (int)v.z; 1530 sv[3] = (int)v.w; 1531 1532 int sr[4]; 1533 sr[0] = (int)r.x; 1534 sr[1] = (int)r.y; 1535 sr[2] = (int)r.z; 1536 sr[3] = (int)r.w; 1537 1538 int ss[4]; 1539 ss[0] = (int)s.x; 1540 ss[1] = (int)s.y; 1541 ss[2] = (int)s.z; 1542 ss[3] = (int)s.w; 1543 1544 if (polarity > 0) { 1545 \n #pragma unroll 4\n 1546 for (unsigned int i = 0; i < 4; i++) { 1547 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; 1548 // 1549 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1550 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1551 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1552 } 1553 } 1554 else { 1555 \n #pragma unroll 4\n 1556 for (unsigned int i = 0; i < 4; i++) { 1557 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; 1558 // 1559 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1560 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1561 } 1562 } 1563 1564 v.x = (CLQuantum)sv[0]; 1565 v.y = (CLQuantum)sv[1]; 1566 v.z = (CLQuantum)sv[2]; 1567 1568 if (matte!=0) 1569 v.w = (CLQuantum)sv[3]; 1570 1571 outputImage[y*imageWidth+x] = v; 1572 1573 } 1574 ) 1575 1576/* 1577%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1578% % 1579% % 1580% % 1581% E q u a l i z e % 1582% % 1583% % 1584% % 1585%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1586*/ 1587 1588 STRINGIFY( 1589 /* 1590 */ 1591 __kernel void Equalize(__global CLPixelType * restrict im, 1592 const ChannelType channel, 1593 __global CLPixelType * restrict equalize_map, 1594 const float4 white, const float4 black) 1595 { 1596 const int x = get_global_id(0); 1597 const int y = get_global_id(1); 1598 const int columns = get_global_size(0); 1599 const int c = x + y * columns; 1600 1601 uint ePos; 1602 CLPixelType oValue, eValue; 1603 CLQuantum red, green, blue, alpha; 1604 1605 //read from global 1606 oValue=im[c]; 1607 1608 if ((channel & SyncChannels) != 0) 1609 { 1610 if (getRedF4(white) != getRedF4(black)) 1611 { 1612 ePos = ScaleQuantumToMap(getRed(oValue)); 1613 eValue = equalize_map[ePos]; 1614 red = getRed(eValue); 1615 ePos = ScaleQuantumToMap(getGreen(oValue)); 1616 eValue = equalize_map[ePos]; 1617 green = getRed(eValue); 1618 ePos = ScaleQuantumToMap(getBlue(oValue)); 1619 eValue = equalize_map[ePos]; 1620 blue = getRed(eValue); 1621 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1622 eValue = equalize_map[ePos]; 1623 alpha = getRed(eValue); 1624 1625 //write back 1626 im[c]=(CLPixelType)(blue, green, red, alpha); 1627 } 1628 1629 } 1630 1631 // for equalizing, we always need all channels? 1632 // otherwise something more 1633 1634 } 1635 ) 1636 1637/* 1638%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1639% % 1640% % 1641% % 1642% F u n c t i o n % 1643% % 1644% % 1645% % 1646%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1647*/ 1648 1649 STRINGIFY( 1650 /* 1651 apply FunctionImageChannel(braightness-contrast) 1652 */ 1653 CLQuantum ApplyFunction(float pixel,const MagickFunction function, 1654 const unsigned int number_parameters,__constant float *parameters) 1655 { 1656 float result = 0.0f; 1657 1658 switch (function) 1659 { 1660 case PolynomialFunction: 1661 { 1662 for (unsigned int i=0; i < number_parameters; i++) 1663 result = result*QuantumScale*pixel + parameters[i]; 1664 result *= QuantumRange; 1665 break; 1666 } 1667 case SinusoidFunction: 1668 { 1669 float freq,phase,ampl,bias; 1670 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1671 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; 1672 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; 1673 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1674 result = QuantumRange*(ampl*sin(2.0f*MagickPI* 1675 (freq*QuantumScale*pixel + phase/360.0f)) + bias); 1676 break; 1677 } 1678 case ArcsinFunction: 1679 { 1680 float width,range,center,bias; 1681 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1682 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1683 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1684 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1685 1686 result = 2.0f/width*(QuantumScale*pixel - center); 1687 result = range/MagickPI*asin(result)+bias; 1688 result = ( result <= -1.0f ) ? bias - range/2.0f : result; 1689 result = ( result >= 1.0f ) ? bias + range/2.0f : result; 1690 result *= QuantumRange; 1691 break; 1692 } 1693 case ArctanFunction: 1694 { 1695 float slope,range,center,bias; 1696 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1697 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1698 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1699 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1700 result = MagickPI*slope*(QuantumScale*pixel-center); 1701 result = QuantumRange*(range/MagickPI*atan(result) + bias); 1702 break; 1703 } 1704 case UndefinedFunction: 1705 break; 1706 } 1707 return(ClampToQuantum(result)); 1708 } 1709 ) 1710 1711 STRINGIFY( 1712 /* 1713 Improve brightness / contrast of the image 1714 channel : define which channel is improved 1715 function : the function called to enchance the brightness contrast 1716 number_parameters : numbers of parameters 1717 parameters : the parameter 1718 */ 1719 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, 1720 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, 1721 __constant float *parameters) 1722 { 1723 const unsigned int x = get_global_id(0); 1724 const unsigned int y = get_global_id(1); 1725 const unsigned int columns = get_global_size(0); 1726 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1727 1728 float red; 1729 float green; 1730 float blue; 1731 float alpha; 1732 1733 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 1734 1735 if ((channel & RedChannel) != 0) 1736 red=ApplyFunction(red, function, number_parameters, parameters); 1737 1738 if (number_channels > 2) 1739 { 1740 if ((channel & GreenChannel) != 0) 1741 green=ApplyFunction(green, function, number_parameters, parameters); 1742 1743 if ((channel & BlueChannel) != 0) 1744 blue=ApplyFunction(blue, function, number_parameters, parameters); 1745 } 1746 1747 if (((number_channels == 4) || (number_channels == 2)) && 1748 ((channel & AlphaChannel) != 0)) 1749 alpha=ApplyFunction(alpha, function, number_parameters, parameters); 1750 1751 WriteChannels(p, number_channels, channel, red, green, blue, alpha); 1752 } 1753 ) 1754 1755/* 1756%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1757% % 1758% % 1759% % 1760% G r a y s c a l e % 1761% % 1762% % 1763% % 1764%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1765*/ 1766 1767 STRINGIFY( 1768 __kernel void Grayscale(__global CLQuantum *image,const int number_channels, 1769 const unsigned int colorspace,const unsigned int method) 1770 { 1771 const unsigned int x = get_global_id(0); 1772 const unsigned int y = get_global_id(1); 1773 const unsigned int columns = get_global_size(0); 1774 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1775 1776 float 1777 blue, 1778 green, 1779 red; 1780 1781 red=getPixelRed(p); 1782 green=getPixelGreen(p); 1783 blue=getPixelBlue(p); 1784 1785 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); 1786 1787 setPixelRed(p,intensity); 1788 setPixelGreen(p,intensity); 1789 setPixelBlue(p,intensity); 1790 } 1791 ) 1792 1793/* 1794%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1795% % 1796% % 1797% % 1798% L o c a l C o n t r a s t % 1799% % 1800% % 1801% % 1802%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1803*/ 1804 1805 STRINGIFY( 1806 1807 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, 1808 const int radius, 1809 const int imageWidth, 1810 const int imageHeight) 1811 { 1812 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); 1813 1814 int x = get_local_id(0); 1815 int y = get_global_id(1); 1816 1817 global CLPixelType *src = srcImage + y * imageWidth; 1818 1819 for (int i = x; i < imageWidth; i += get_local_size(0)) { 1820 float sum = 0.0f; 1821 float weight = 1.0f; 1822 1823 int j = i - radius; 1824 while ((j + 7) < i) { 1825 for (int k = 0; k < 8; ++k) // Unroll 8x 1826 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); 1827 weight += 8.0f; 1828 j+=8; 1829 } 1830 while (j < i) { 1831 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); 1832 weight += 1.0f; 1833 ++j; 1834 } 1835 1836 while ((j + 7) < radius + i) { 1837 for (int k = 0; k < 8; ++k) // Unroll 8x 1838 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); 1839 weight -= 8.0f; 1840 j+=8; 1841 } 1842 while (j < radius + i) { 1843 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); 1844 weight -= 1.0f; 1845 ++j; 1846 } 1847 1848 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); 1849 } 1850 } 1851 ) 1852 1853 STRINGIFY( 1854 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, 1855 const int radius, 1856 const float strength, 1857 const int imageWidth, 1858 const int imageHeight) 1859 { 1860 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); 1861 1862 int x = get_global_id(0); 1863 int y = get_global_id(1); 1864 1865 if ((x >= imageWidth) || (y >= imageHeight)) 1866 return; 1867 1868 global float *src = blurImage + x; 1869 1870 float sum = 0.0f; 1871 float weight = 1.0f; 1872 1873 int j = y - radius; 1874 while ((j + 7) < y) { 1875 for (int k = 0; k < 8; ++k) // Unroll 8x 1876 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; 1877 weight += 8.0f; 1878 j+=8; 1879 } 1880 while (j < y) { 1881 sum += weight * src[mirrorBottom(j) * imageWidth]; 1882 weight += 1.0f; 1883 ++j; 1884 } 1885 1886 while ((j + 7) < radius + y) { 1887 for (int k = 0; k < 8; ++k) // Unroll 8x 1888 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; 1889 weight -= 8.0f; 1890 j+=8; 1891 } 1892 while (j < radius + y) { 1893 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; 1894 weight -= 1.0f; 1895 ++j; 1896 } 1897 1898 CLPixelType pixel = srcImage[x + y * imageWidth]; 1899 float srcVal = dot(RGB, convert_float4(pixel)); 1900 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); 1901 mult = (srcVal + mult) / srcVal; 1902 1903 pixel.x = ClampToQuantum(pixel.x * mult); 1904 pixel.y = ClampToQuantum(pixel.y * mult); 1905 pixel.z = ClampToQuantum(pixel.z * mult); 1906 1907 dstImage[x + y * imageWidth] = pixel; 1908 } 1909 ) 1910 1911/* 1912%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1913% % 1914% % 1915% % 1916% M o d u l a t e % 1917% % 1918% % 1919% % 1920%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1921*/ 1922 1923 STRINGIFY( 1924 1925 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, 1926 float *hue, float *saturation, float *lightness) 1927 { 1928 float 1929 c, 1930 tmax, 1931 tmin; 1932 1933 /* 1934 Convert RGB to HSL colorspace. 1935 */ 1936 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); 1937 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); 1938 1939 c=tmax-tmin; 1940 1941 *lightness=(tmax+tmin)/2.0; 1942 if (c <= 0.0) 1943 { 1944 *hue=0.0; 1945 *saturation=0.0; 1946 return; 1947 } 1948 1949 if (tmax == (QuantumScale*red)) 1950 { 1951 *hue=(QuantumScale*green-QuantumScale*blue)/c; 1952 if ((QuantumScale*green) < (QuantumScale*blue)) 1953 *hue+=6.0; 1954 } 1955 else 1956 if (tmax == (QuantumScale*green)) 1957 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; 1958 else 1959 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; 1960 1961 *hue*=60.0/360.0; 1962 if (*lightness <= 0.5) 1963 *saturation=c/(2.0*(*lightness)); 1964 else 1965 *saturation=c/(2.0-2.0*(*lightness)); 1966 } 1967 1968 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, 1969 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 1970 { 1971 float 1972 b, 1973 c, 1974 g, 1975 h, 1976 tmin, 1977 r, 1978 x; 1979 1980 /* 1981 Convert HSL to RGB colorspace. 1982 */ 1983 h=hue*360.0; 1984 if (lightness <= 0.5) 1985 c=2.0*lightness*saturation; 1986 else 1987 c=(2.0-2.0*lightness)*saturation; 1988 tmin=lightness-0.5*c; 1989 h-=360.0*floor(h/360.0); 1990 h/=60.0; 1991 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); 1992 switch ((int) floor(h) % 6) 1993 { 1994 case 0: 1995 { 1996 r=tmin+c; 1997 g=tmin+x; 1998 b=tmin; 1999 break; 2000 } 2001 case 1: 2002 { 2003 r=tmin+x; 2004 g=tmin+c; 2005 b=tmin; 2006 break; 2007 } 2008 case 2: 2009 { 2010 r=tmin; 2011 g=tmin+c; 2012 b=tmin+x; 2013 break; 2014 } 2015 case 3: 2016 { 2017 r=tmin; 2018 g=tmin+x; 2019 b=tmin+c; 2020 break; 2021 } 2022 case 4: 2023 { 2024 r=tmin+x; 2025 g=tmin; 2026 b=tmin+c; 2027 break; 2028 } 2029 case 5: 2030 { 2031 r=tmin+c; 2032 g=tmin; 2033 b=tmin+x; 2034 break; 2035 } 2036 default: 2037 { 2038 r=0.0; 2039 g=0.0; 2040 b=0.0; 2041 } 2042 } 2043 *red=ClampToQuantum(QuantumRange*r); 2044 *green=ClampToQuantum(QuantumRange*g); 2045 *blue=ClampToQuantum(QuantumRange*b); 2046 } 2047 2048 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 2049 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2050 { 2051 float 2052 hue, 2053 lightness, 2054 saturation; 2055 2056 /* 2057 Increase or decrease color lightness, saturation, or hue. 2058 */ 2059 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); 2060 hue+=0.5*(0.01*percent_hue-1.0); 2061 while (hue < 0.0) 2062 hue+=1.0; 2063 while (hue >= 1.0) 2064 hue-=1.0; 2065 saturation*=0.01*percent_saturation; 2066 lightness*=0.01*percent_lightness; 2067 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); 2068 } 2069 2070 __kernel void Modulate(__global CLPixelType *im, 2071 const float percent_brightness, 2072 const float percent_hue, 2073 const float percent_saturation, 2074 const int colorspace) 2075 { 2076 2077 const int x = get_global_id(0); 2078 const int y = get_global_id(1); 2079 const int columns = get_global_size(0); 2080 const int c = x + y * columns; 2081 2082 CLPixelType pixel = im[c]; 2083 2084 CLQuantum 2085 blue, 2086 green, 2087 red; 2088 2089 red=getRed(pixel); 2090 green=getGreen(pixel); 2091 blue=getBlue(pixel); 2092 2093 switch (colorspace) 2094 { 2095 case HSLColorspace: 2096 default: 2097 { 2098 ModulateHSL(percent_hue, percent_saturation, percent_brightness, 2099 &red, &green, &blue); 2100 } 2101 2102 } 2103 2104 CLPixelType filteredPixel; 2105 2106 setRed(&filteredPixel, red); 2107 setGreen(&filteredPixel, green); 2108 setBlue(&filteredPixel, blue); 2109 filteredPixel.w = pixel.w; 2110 2111 im[c] = filteredPixel; 2112 } 2113 ) 2114 2115/* 2116%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2117% % 2118% % 2119% % 2120% M o t i o n B l u r % 2121% % 2122% % 2123% % 2124%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2125*/ 2126 2127 STRINGIFY( 2128 __kernel 2129 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, 2130 const unsigned int imageWidth, const unsigned int imageHeight, 2131 const __global float *filter, const unsigned int width, const __global int2* offset, 2132 const float4 bias, 2133 const ChannelType channel, const unsigned int matte) { 2134 2135 int2 currentPixel; 2136 currentPixel.x = get_global_id(0); 2137 currentPixel.y = get_global_id(1); 2138 2139 if (currentPixel.x >= imageWidth 2140 || currentPixel.y >= imageHeight) 2141 return; 2142 2143 float4 pixel; 2144 pixel.x = (float)bias.x; 2145 pixel.y = (float)bias.y; 2146 pixel.z = (float)bias.z; 2147 pixel.w = (float)bias.w; 2148 2149 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 2150 2151 for (int i = 0; i < width; i++) { 2152 // only support EdgeVirtualPixelMethod through ClampToCanvas 2153 // TODO: implement other virtual pixel method 2154 int2 samplePixel = currentPixel + offset[i]; 2155 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2156 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2157 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2158 2159 pixel.x += (filter[i] * (float)samplePixelValue.x); 2160 pixel.y += (filter[i] * (float)samplePixelValue.y); 2161 pixel.z += (filter[i] * (float)samplePixelValue.z); 2162 pixel.w += (filter[i] * (float)samplePixelValue.w); 2163 } 2164 2165 CLPixelType outputPixel; 2166 outputPixel.x = ClampToQuantum(pixel.x); 2167 outputPixel.y = ClampToQuantum(pixel.y); 2168 outputPixel.z = ClampToQuantum(pixel.z); 2169 outputPixel.w = ClampToQuantum(pixel.w); 2170 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2171 } 2172 else { 2173 2174 float gamma = 0.0f; 2175 for (int i = 0; i < width; i++) { 2176 // only support EdgeVirtualPixelMethod through ClampToCanvas 2177 // TODO: implement other virtual pixel method 2178 int2 samplePixel = currentPixel + offset[i]; 2179 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2180 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2181 2182 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2183 2184 float alpha = QuantumScale*samplePixelValue.w; 2185 float k = filter[i]; 2186 pixel.x = pixel.x + k * alpha * samplePixelValue.x; 2187 pixel.y = pixel.y + k * alpha * samplePixelValue.y; 2188 pixel.z = pixel.z + k * alpha * samplePixelValue.z; 2189 2190 pixel.w += k * alpha * samplePixelValue.w; 2191 2192 gamma+=k*alpha; 2193 } 2194 gamma = PerceptibleReciprocal(gamma); 2195 pixel.xyz = gamma*pixel.xyz; 2196 2197 CLPixelType outputPixel; 2198 outputPixel.x = ClampToQuantum(pixel.x); 2199 outputPixel.y = ClampToQuantum(pixel.y); 2200 outputPixel.z = ClampToQuantum(pixel.z); 2201 outputPixel.w = ClampToQuantum(pixel.w); 2202 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2203 } 2204 } 2205 ) 2206 2207/* 2208%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2209% % 2210% % 2211% % 2212% R e s i z e % 2213% % 2214% % 2215% % 2216%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2217*/ 2218 2219 STRINGIFY( 2220 // Based on Box from resize.c 2221 float BoxResizeFilter(const float x) 2222 { 2223 return 1.0f; 2224 } 2225 ) 2226 2227 STRINGIFY( 2228 // Based on CubicBC from resize.c 2229 float CubicBC(const float x,const __global float* resizeFilterCoefficients) 2230 { 2231 /* 2232 Cubic Filters using B,C determined values: 2233 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter 2234 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears 2235 Spline B = 1 C = 0 B-Spline Gaussian approximation 2236 Hermite B = 0 C = 0 B-Spline interpolator 2237 2238 See paper by Mitchell and Netravali, Reconstruction Filters in Computer 2239 Graphics Computer Graphics, Volume 22, Number 4, August 1988 2240 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ 2241 Mitchell.pdf. 2242 2243 Coefficents are determined from B,C values: 2244 P0 = ( 6 - 2*B )/6 = coeff[0] 2245 P1 = 0 2246 P2 = (-18 +12*B + 6*C )/6 = coeff[1] 2247 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] 2248 Q0 = ( 8*B +24*C )/6 = coeff[3] 2249 Q1 = ( -12*B -48*C )/6 = coeff[4] 2250 Q2 = ( 6*B +30*C )/6 = coeff[5] 2251 Q3 = ( - 1*B - 6*C )/6 = coeff[6] 2252 2253 which are used to define the filter: 2254 2255 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 2256 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 2257 2258 which ensures function is continuous in value and derivative (slope). 2259 */ 2260 if (x < 1.0) 2261 return(resizeFilterCoefficients[0]+x*(x* 2262 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); 2263 if (x < 2.0) 2264 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* 2265 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); 2266 return(0.0); 2267 } 2268 ) 2269 2270 STRINGIFY( 2271 float Sinc(const float x) 2272 { 2273 if (x != 0.0f) 2274 { 2275 const float alpha=(float) (MagickPI*x); 2276 return sinpi(x)/alpha; 2277 } 2278 return(1.0f); 2279 } 2280 ) 2281 2282 STRINGIFY( 2283 float Triangle(const float x) 2284 { 2285 /* 2286 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or 2287 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function 2288 for Sinc(). 2289 */ 2290 return ((x<1.0f)?(1.0f-x):0.0f); 2291 } 2292 ) 2293 2294 2295 STRINGIFY( 2296 float Hann(const float x) 2297 { 2298 /* 2299 Cosine window function: 2300 0.5+0.5*cos(pi*x). 2301 */ 2302 const float cosine=cos((MagickPI*x)); 2303 return(0.5f+0.5f*cosine); 2304 } 2305 ) 2306 2307 STRINGIFY( 2308 float Hamming(const float x) 2309 { 2310 /* 2311 Offset cosine window function: 2312 .54 + .46 cos(pi x). 2313 */ 2314 const float cosine=cos((MagickPI*x)); 2315 return(0.54f+0.46f*cosine); 2316 } 2317 ) 2318 2319 STRINGIFY( 2320 float Blackman(const float x) 2321 { 2322 /* 2323 Blackman: 2nd order cosine windowing function: 2324 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) 2325 2326 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and 2327 five flops. 2328 */ 2329 const float cosine=cos((MagickPI*x)); 2330 return(0.34f+cosine*(0.5f+cosine*0.16f)); 2331 } 2332 ) 2333 2334 STRINGIFY( 2335 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) 2336 { 2337 switch (filterType) 2338 { 2339 /* Call Sinc even for SincFast to get better precision on GPU 2340 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ 2341 case SincWeightingFunction: 2342 case SincFastWeightingFunction: 2343 return Sinc(x); 2344 case CubicBCWeightingFunction: 2345 return CubicBC(x,filterCoefficients); 2346 case BoxWeightingFunction: 2347 return BoxResizeFilter(x); 2348 case TriangleWeightingFunction: 2349 return Triangle(x); 2350 case HannWeightingFunction: 2351 return Hann(x); 2352 case HammingWeightingFunction: 2353 return Hamming(x); 2354 case BlackmanWeightingFunction: 2355 return Blackman(x); 2356 2357 default: 2358 return 0.0f; 2359 } 2360 } 2361 ) 2362 2363 2364 STRINGIFY( 2365 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType 2366 , const ResizeWeightingFunctionType resizeWindowType 2367 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) 2368 { 2369 float scale; 2370 float xBlur = fabs(x/resizeFilterBlur); 2371 if (resizeWindowSupport < MagickEpsilon 2372 || resizeWindowType == BoxWeightingFunction) 2373 { 2374 scale = 1.0f; 2375 } 2376 else 2377 { 2378 scale = resizeFilterScale; 2379 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); 2380 } 2381 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); 2382 return weight; 2383 } 2384 2385 ) 2386 2387 ; 2388 const char *accelerateKernels2 = 2389 2390 STRINGIFY( 2391 2392 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2393 return (numWorkItems/pixelPerWorkgroup); 2394 } 2395 2396 // returns the index of the pixel for the current workitem to compute. 2397 // returns -1 if this workitem doesn't need to participate in any computation 2398 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2399 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); 2400 int pixelIndex = itemID/numWorkItemsPerPixel; 2401 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; 2402 return pixelIndex; 2403 } 2404 2405 ) 2406 2407 STRINGIFY( 2408 __kernel __attribute__((reqd_work_group_size(256, 1, 1))) 2409 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2410 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2411 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, 2412 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2413 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2414 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2415 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2416 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2417 { 2418 // calculate the range of resized image pixels computed by this workgroup 2419 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; 2420 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); 2421 const unsigned int actualNumPixelToCompute = stopX - startX; 2422 2423 // calculate the range of input image pixels to cache 2424 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); 2425 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2426 scale = PerceptibleReciprocal(scale); 2427 2428 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); 2429 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); 2430 2431 // cache the input pixels into local memory 2432 const unsigned int y = get_global_id(1); 2433 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); 2434 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; 2435 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); 2436 wait_group_events(1, &e); 2437 2438 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2439 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2440 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2441 { 2442 const unsigned int chunkStartX = startX + chunk*pixelChunkSize; 2443 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); 2444 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; 2445 2446 // determine which resized pixel computed by this workitem 2447 const unsigned int itemID = get_local_id(0); 2448 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); 2449 2450 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); 2451 2452 float4 filteredPixel = (float4)0.0f; 2453 float density = 0.0f; 2454 float gamma = 0.0f; 2455 // -1 means this workitem doesn't participate in the computation 2456 if (pixelIndex != -1) 2457 { 2458 // x coordinated of the resized pixel computed by this workitem 2459 const int x = chunkStartX + pixelIndex; 2460 2461 // calculate how many steps required for this pixel 2462 const float bisect = (x+0.5)/xFactor+MagickEpsilon; 2463 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2464 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); 2465 const unsigned int n = stop - start; 2466 2467 // calculate how many steps this workitem will contribute 2468 unsigned int numStepsPerWorkItem = n / numItems; 2469 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2470 2471 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2472 if (startStep < n) 2473 { 2474 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2475 2476 unsigned int cacheIndex = start+startStep-cacheRangeStartX; 2477 2478 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2479 { 2480 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2481 (ResizeWeightingFunctionType) resizeFilterType, 2482 (ResizeWeightingFunctionType) resizeWindowType, 2483 resizeFilterScale, resizeFilterWindowSupport, 2484 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2485 2486 float4 cp = (float4) 0; 2487 2488 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); 2489 cp.x = (float) *(p); 2490 if (number_channels > 2) 2491 { 2492 cp.y = (float) *(p + 1); 2493 cp.z = (float) *(p + 2); 2494 } 2495 2496 if (alpha_index != 0) 2497 { 2498 cp.w = (float) *(p + alpha_index); 2499 2500 float alpha = weight * QuantumScale * cp.w; 2501 2502 filteredPixel.x += alpha * cp.x; 2503 filteredPixel.y += alpha * cp.y; 2504 filteredPixel.z += alpha * cp.z; 2505 filteredPixel.w += weight * cp.w; 2506 gamma += alpha; 2507 } 2508 else 2509 filteredPixel += ((float4) weight)*cp; 2510 2511 density += weight; 2512 } 2513 } 2514 } 2515 2516 // initialize the accumulators to zero 2517 if (itemID < actualNumPixelInThisChunk) { 2518 outputPixelCache[itemID] = (float4)0.0f; 2519 densityCache[itemID] = 0.0f; 2520 if (alpha_index != 0) 2521 gammaCache[itemID] = 0.0f; 2522 } 2523 barrier(CLK_LOCAL_MEM_FENCE); 2524 2525 // accumulatte the filtered pixel value and the density 2526 for (unsigned int i = 0; i < numItems; i++) { 2527 if (pixelIndex != -1) { 2528 if (itemID%numItems == i) { 2529 outputPixelCache[pixelIndex]+=filteredPixel; 2530 densityCache[pixelIndex]+=density; 2531 if (alpha_index != 0) 2532 gammaCache[pixelIndex]+=gamma; 2533 } 2534 } 2535 barrier(CLK_LOCAL_MEM_FENCE); 2536 } 2537 2538 if (itemID < actualNumPixelInThisChunk) 2539 { 2540 float4 filteredPixel = outputPixelCache[itemID]; 2541 2542 float gamma = 0.0f; 2543 if (alpha_index != 0) 2544 gamma = gammaCache[itemID]; 2545 2546 float density = densityCache[itemID]; 2547 if ((density != 0.0f) && (density != 1.0f)) 2548 { 2549 density = PerceptibleReciprocal(density); 2550 filteredPixel *= (float4) density; 2551 gamma *= density; 2552 } 2553 2554 if (alpha_index != 0) 2555 { 2556 gamma = PerceptibleReciprocal(gamma); 2557 filteredPixel.x *= gamma; 2558 filteredPixel.y *= gamma; 2559 filteredPixel.z *= gamma; 2560 } 2561 2562 WriteFloat4(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, AllChannels, filteredPixel); 2563 } 2564 } 2565 } 2566 ) 2567 2568 2569 STRINGIFY( 2570 __kernel __attribute__((reqd_work_group_size(1, 256, 1))) 2571 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2572 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2573 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, 2574 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2575 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2576 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2577 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2578 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2579 { 2580 // calculate the range of resized image pixels computed by this workgroup 2581 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; 2582 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); 2583 const unsigned int actualNumPixelToCompute = stopY - startY; 2584 2585 // calculate the range of input image pixels to cache 2586 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); 2587 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2588 scale = PerceptibleReciprocal(scale); 2589 2590 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); 2591 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); 2592 2593 // cache the input pixels into local memory 2594 const unsigned int x = get_global_id(0); 2595 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); 2596 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; 2597 unsigned int stride = inputColumns * number_channels; 2598 for (unsigned int i = 0; i < number_channels; i++) 2599 { 2600 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); 2601 wait_group_events(1,&e); 2602 } 2603 2604 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2605 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2606 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2607 { 2608 const unsigned int chunkStartY = startY + chunk*pixelChunkSize; 2609 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); 2610 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; 2611 2612 // determine which resized pixel computed by this workitem 2613 const unsigned int itemID = get_local_id(1); 2614 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); 2615 2616 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); 2617 2618 float4 filteredPixel = (float4)0.0f; 2619 float density = 0.0f; 2620 float gamma = 0.0f; 2621 // -1 means this workitem doesn't participate in the computation 2622 if (pixelIndex != -1) 2623 { 2624 // x coordinated of the resized pixel computed by this workitem 2625 const int y = chunkStartY + pixelIndex; 2626 2627 // calculate how many steps required for this pixel 2628 const float bisect = (y+0.5)/yFactor+MagickEpsilon; 2629 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2630 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); 2631 const unsigned int n = stop - start; 2632 2633 // calculate how many steps this workitem will contribute 2634 unsigned int numStepsPerWorkItem = n / numItems; 2635 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2636 2637 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2638 if (startStep < n) 2639 { 2640 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2641 2642 unsigned int cacheIndex = start+startStep-cacheRangeStartY; 2643 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2644 { 2645 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2646 (ResizeWeightingFunctionType) resizeFilterType, 2647 (ResizeWeightingFunctionType) resizeWindowType, 2648 resizeFilterScale, resizeFilterWindowSupport, 2649 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2650 2651 float4 cp = (float4)0.0f; 2652 2653 __local CLQuantum *p = inputImageCache + cacheIndex; 2654 cp.x = (float) *(p); 2655 if (number_channels > 2) 2656 { 2657 cp.y = (float) *(p + rangeLength); 2658 cp.z = (float) *(p + (rangeLength * 2)); 2659 } 2660 2661 if (alpha_index != 0) 2662 { 2663 cp.w = (float) *(p + (rangeLength * alpha_index)); 2664 2665 float alpha = weight * QuantumScale * cp.w; 2666 2667 filteredPixel.x += alpha * cp.x; 2668 filteredPixel.y += alpha * cp.y; 2669 filteredPixel.z += alpha * cp.z; 2670 filteredPixel.w += weight * cp.w; 2671 gamma += alpha; 2672 } 2673 else 2674 filteredPixel += ((float4) weight)*cp; 2675 2676 density += weight; 2677 } 2678 } 2679 } 2680 2681 // initialize the accumulators to zero 2682 if (itemID < actualNumPixelInThisChunk) { 2683 outputPixelCache[itemID] = (float4)0.0f; 2684 densityCache[itemID] = 0.0f; 2685 if (alpha_index != 0) 2686 gammaCache[itemID] = 0.0f; 2687 } 2688 barrier(CLK_LOCAL_MEM_FENCE); 2689 2690 // accumulatte the filtered pixel value and the density 2691 for (unsigned int i = 0; i < numItems; i++) { 2692 if (pixelIndex != -1) { 2693 if (itemID%numItems == i) { 2694 outputPixelCache[pixelIndex]+=filteredPixel; 2695 densityCache[pixelIndex]+=density; 2696 if (alpha_index != 0) 2697 gammaCache[pixelIndex]+=gamma; 2698 } 2699 } 2700 barrier(CLK_LOCAL_MEM_FENCE); 2701 } 2702 2703 if (itemID < actualNumPixelInThisChunk) 2704 { 2705 float4 filteredPixel = outputPixelCache[itemID]; 2706 2707 float gamma = 0.0f; 2708 if (alpha_index != 0) 2709 gamma = gammaCache[itemID]; 2710 2711 float density = densityCache[itemID]; 2712 if ((density != 0.0f) && (density != 1.0f)) 2713 { 2714 density = PerceptibleReciprocal(density); 2715 filteredPixel *= (float4) density; 2716 gamma *= density; 2717 } 2718 2719 if (alpha_index != 0) 2720 { 2721 gamma = PerceptibleReciprocal(gamma); 2722 filteredPixel.x *= gamma; 2723 filteredPixel.y *= gamma; 2724 filteredPixel.z *= gamma; 2725 } 2726 2727 WriteFloat4(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, AllChannels, filteredPixel); 2728 } 2729 } 2730 } 2731 ) 2732 2733/* 2734%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2735% % 2736% % 2737% % 2738% R o t a t i o n a l B l u r % 2739% % 2740% % 2741% % 2742%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2743*/ 2744 2745 STRINGIFY( 2746 __kernel void RotationalBlur(const __global CLQuantum *image, 2747 const unsigned int number_channels,const unsigned int channel, 2748 const float2 blurCenter,__constant float *cos_theta, 2749 __constant float *sin_theta,const unsigned int cossin_theta_size, 2750 __global CLQuantum *filteredImage) 2751 { 2752 const int x = get_global_id(0); 2753 const int y = get_global_id(1); 2754 const int columns = get_global_size(0); 2755 const int rows = get_global_size(1); 2756 unsigned int step = 1; 2757 float center_x = (float) x - blurCenter.x; 2758 float center_y = (float) y - blurCenter.y; 2759 float radius = hypot(center_x, center_y); 2760 2761 float blur_radius = hypot(blurCenter.x, blurCenter.y); 2762 2763 if (radius > MagickEpsilon) 2764 { 2765 step = (unsigned int) (blur_radius / radius); 2766 if (step == 0) 2767 step = 1; 2768 if (step >= cossin_theta_size) 2769 step = cossin_theta_size-1; 2770 } 2771 2772 float4 result = 0.0f; 2773 float normalize = 0.0f; 2774 float gamma = 0.0f; 2775 2776 for (unsigned int i=0; i<cossin_theta_size; i+=step) 2777 { 2778 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); 2779 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); 2780 2781 float4 pixel = ReadFloat4(image, number_channels, columns, cx, cy, AllChannels); 2782 2783 if ((number_channels == 4) || (number_channels == 2)) 2784 { 2785 float alpha = (float)(QuantumScale*pixel.w); 2786 2787 gamma += alpha; 2788 2789 result.x += alpha * pixel.x; 2790 result.y += alpha * pixel.y; 2791 result.z += alpha * pixel.z; 2792 result.w += pixel.w; 2793 } 2794 else 2795 result += pixel; 2796 2797 normalize += 1.0f; 2798 } 2799 2800 normalize = PerceptibleReciprocal(normalize); 2801 2802 if ((number_channels == 4) || (number_channels == 2)) 2803 { 2804 gamma = PerceptibleReciprocal(gamma); 2805 result.x *= gamma; 2806 result.y *= gamma; 2807 result.z *= gamma; 2808 result.w *= normalize; 2809 } 2810 else 2811 result *= normalize; 2812 2813 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 2814 } 2815 ) 2816 2817/* 2818%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2819% % 2820% % 2821% % 2822% U n s h a r p M a s k % 2823% % 2824% % 2825% % 2826%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2827*/ 2828 2829 STRINGIFY( 2830 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, 2831 const __global float4 *blurRowData,const unsigned int number_channels, 2832 const ChannelType channel,const unsigned int columns, 2833 const unsigned int rows,__local float4* cachedData, 2834 __local float* cachedFilter,const __global float *filter, 2835 const unsigned int width,const float gain, const float threshold, 2836 __global CLQuantum *filteredImage) 2837 { 2838 const unsigned int radius = (width-1)/2; 2839 2840 // cache the pixel shared by the workgroup 2841 const int groupX = get_group_id(0); 2842 const int groupStartY = get_group_id(1)*get_local_size(1) - radius; 2843 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; 2844 2845 if ((groupStartY >= 0) && (groupStopY < rows)) 2846 { 2847 event_t e = async_work_group_strided_copy(cachedData, 2848 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); 2849 wait_group_events(1,&e); 2850 } 2851 else 2852 { 2853 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) 2854 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; 2855 2856 barrier(CLK_LOCAL_MEM_FENCE); 2857 } 2858 // cache the filter as well 2859 event_t e = async_work_group_copy(cachedFilter,filter,width,0); 2860 wait_group_events(1,&e); 2861 2862 // only do the work if this is not a patched item 2863 const int cy = get_global_id(1); 2864 2865 if (cy < rows) 2866 { 2867 float4 blurredPixel = (float4) 0.0f; 2868 2869 int i = 0; 2870 2871 for ( ; i+7 < width; ) 2872 { 2873 for (int j=0; j < 8; j++, i++) 2874 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; 2875 } 2876 2877 for ( ; i < width; i++) 2878 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; 2879 2880 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); 2881 float4 outputPixel = inputImagePixel - blurredPixel; 2882 2883 float quantumThreshold = QuantumRange*threshold; 2884 2885 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); 2886 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); 2887 2888 //write back 2889 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); 2890 } 2891 } 2892 ) 2893 2894 STRINGIFY( 2895 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, 2896 const ChannelType channel,__constant float *filter,const unsigned int width, 2897 const unsigned int columns,const unsigned int rows,__local float4 *pixels, 2898 const float gain,const float threshold,__global CLQuantum *filteredImage) 2899 { 2900 const unsigned int x = get_global_id(0); 2901 const unsigned int y = get_global_id(1); 2902 2903 const unsigned int radius = (width - 1) / 2; 2904 2905 int row = y - radius; 2906 int baseRow = get_group_id(1) * get_local_size(1) - radius; 2907 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; 2908 2909 while (row < endRow) { 2910 int srcy = (row < 0) ? -row : row; // mirror pad 2911 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; 2912 2913 float4 value = 0.0f; 2914 2915 int ix = x - radius; 2916 int i = 0; 2917 2918 while (i + 7 < width) { 2919 for (int j = 0; j < 8; ++j) { // unrolled 2920 int srcx = ix + j; 2921 srcx = (srcx < 0) ? -srcx : srcx; 2922 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2923 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2924 } 2925 ix += 8; 2926 i += 8; 2927 } 2928 2929 while (i < width) { 2930 int srcx = (ix < 0) ? -ix : ix; // mirror pad 2931 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2932 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2933 ++i; 2934 ++ix; 2935 } 2936 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; 2937 row += get_local_size(1); 2938 } 2939 2940 barrier(CLK_LOCAL_MEM_FENCE); 2941 2942 const int px = get_local_id(0); 2943 const int py = get_local_id(1); 2944 const int prp = get_local_size(0); 2945 float4 value = (float4)(0.0f); 2946 2947 int i = 0; 2948 while (i + 7 < width) { 2949 for (int j = 0; j < 8; ++j) // unrolled 2950 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; 2951 i += 8; 2952 } 2953 while (i < width) { 2954 value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; 2955 ++i; 2956 } 2957 2958 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); 2959 float4 diff = srcPixel - value; 2960 2961 float quantumThreshold = QuantumRange*threshold; 2962 2963 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); 2964 value = select(srcPixel + diff * gain, srcPixel, mask); 2965 2966 if ((x < columns) && (y < rows)) 2967 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); 2968 } 2969 ) 2970 2971 STRINGIFY( 2972 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) 2973 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, 2974 const unsigned int number_channels,const unsigned int max_channels, 2975 const float threshold,const int passes,const unsigned int imageWidth, 2976 const unsigned int imageHeight) 2977 { 2978 const int pad = (1 << (passes - 1)); 2979 const int tileSize = 64; 2980 const int tileRowPixels = 64; 2981 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; 2982 2983 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) 2984 2985 local float buffer[64 * 64]; 2986 2987 int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); 2988 int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; 2989 2990 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 2991 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + 2992 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; 2993 2994 for (int channel = 0; channel < max_channels; ++channel) 2995 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; 2996 } 2997 2998 for (int channel = 0; channel < max_channels; ++channel) { 2999 // Load LDS 3000 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3001 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); 3002 3003 // Process 3004 3005 float tmp[16]; 3006 float accum[16]; 3007 float pixel; 3008 3009 for (int i = 0; i < 16; i++) 3010 accum[i]=0.0f; 3011 3012 for (int pass = 0; pass < passes; ++pass) { 3013 const int radius = 1 << pass; 3014 const int x = get_local_id(0); 3015 const float thresh = threshold * noise[pass]; 3016 3017 // Apply horizontal hat 3018 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3019 const int offset = i * tileRowPixels; 3020 if (pass == 0) 3021 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass 3022 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); 3023 barrier(CLK_LOCAL_MEM_FENCE); 3024 buffer[x + offset] = pixel; 3025 } 3026 barrier(CLK_LOCAL_MEM_FENCE); 3027 3028 // Apply vertical hat 3029 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3030 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); 3031 float delta = tmp[i / 4] - pixel; 3032 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done 3033 if (delta < -thresh) 3034 delta += thresh; 3035 else if (delta > thresh) 3036 delta -= thresh; 3037 else 3038 delta = 0; 3039 accum[i / 4] += delta; 3040 } 3041 barrier(CLK_LOCAL_MEM_FENCE); 3042 3043 if (pass < passes - 1) 3044 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3045 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass 3046 else // last pass 3047 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3048 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output 3049 barrier(CLK_LOCAL_MEM_FENCE); 3050 } 3051 3052 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3053 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); 3054 3055 barrier(CLK_LOCAL_MEM_FENCE); 3056 } 3057 3058 // Write from stage to output 3059 3060 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { 3061 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3062 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { 3063 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); 3064 for (int channel = 0; channel < max_channels; ++channel) { 3065 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; 3066 } 3067 } 3068 } 3069 } 3070 } 3071 ) 3072 3073 ; 3074 3075#endif // MAGICKCORE_OPENCL_SUPPORT 3076 3077#if defined(__cplusplus) || defined(c_plusplus) 3078} 3079#endif 3080 3081#endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 3082