1 /* 2 Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization 3 dedicated to making software imaging solutions freely available. 4 5 You may not use this file except in compliance with the License. 6 obtain a copy of the License at 7 8 http://www.imagemagick.org/script/license.php 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 16 MagickCore private kernels for accelerated functions. 17 */ 18 19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 21 22 #if defined(__cplusplus) || defined(c_plusplus) 23 extern "C" { 24 #endif 25 26 #if defined(MAGICKCORE_OPENCL_SUPPORT) 27 28 /* 29 Define declarations. 30 */ 31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n" 32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n" 33 #define OPENCL_ELSE() "\n #""else " " \n" 34 #define OPENCL_ENDIF() "\n #""endif " " \n" 35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n" 36 #define STRINGIFY(...) #__VA_ARGS__ "\n" 37 38 /* 39 Typedef declarations. 40 */ 41 42 typedef struct _FloatPixelPacket 43 { 44 MagickRealType 45 red, 46 green, 47 blue, 48 alpha, 49 black; 50 } FloatPixelPacket; 51 52 const char *accelerateKernels = 53 54 /* 55 Define declarations. 56 */ 57 OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f)) 58 OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f)) 59 OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f)) 60 OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f)) 61 OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f)) 62 OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f)) 63 OPENCL_DEFINE(SigmaRandom, (attenuate)) 64 OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f)) 65 OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y))) 66 OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y))) 67 68 /* 69 Typedef declarations. 70 */ 71 STRINGIFY( 72 typedef enum 73 { 74 UndefinedColorspace, 75 RGBColorspace, /* Linear RGB colorspace */ 76 GRAYColorspace, /* greyscale (linear) image (faked 1 channel) */ 77 TransparentColorspace, 78 OHTAColorspace, 79 LabColorspace, 80 XYZColorspace, 81 YCbCrColorspace, 82 YCCColorspace, 83 YIQColorspace, 84 YPbPrColorspace, 85 YUVColorspace, 86 CMYKColorspace, /* negared linear RGB with black separated */ 87 sRGBColorspace, /* Default: non-lienar sRGB colorspace */ 88 HSBColorspace, 89 HSLColorspace, 90 HWBColorspace, 91 Rec601LumaColorspace, 92 Rec601YCbCrColorspace, 93 Rec709LumaColorspace, 94 Rec709YCbCrColorspace, 95 LogColorspace, 96 CMYColorspace, /* negated linear RGB colorspace */ 97 LuvColorspace, 98 HCLColorspace, 99 LCHColorspace, /* alias for LCHuv */ 100 LMSColorspace, 101 LCHabColorspace, /* Cylindrical (Polar) Lab */ 102 LCHuvColorspace, /* Cylindrical (Polar) Luv */ 103 scRGBColorspace, 104 HSIColorspace, 105 HSVColorspace, /* alias for HSB */ 106 HCLpColorspace, 107 YDbDrColorspace 108 } ColorspaceType; 109 ) 110 111 STRINGIFY( 112 typedef enum 113 { 114 UndefinedCompositeOp, 115 NoCompositeOp, 116 ModulusAddCompositeOp, 117 AtopCompositeOp, 118 BlendCompositeOp, 119 BumpmapCompositeOp, 120 ChangeMaskCompositeOp, 121 ClearCompositeOp, 122 ColorBurnCompositeOp, 123 ColorDodgeCompositeOp, 124 ColorizeCompositeOp, 125 CopyBlackCompositeOp, 126 CopyBlueCompositeOp, 127 CopyCompositeOp, 128 CopyCyanCompositeOp, 129 CopyGreenCompositeOp, 130 CopyMagentaCompositeOp, 131 CopyOpacityCompositeOp, 132 CopyRedCompositeOp, 133 CopyYellowCompositeOp, 134 DarkenCompositeOp, 135 DstAtopCompositeOp, 136 DstCompositeOp, 137 DstInCompositeOp, 138 DstOutCompositeOp, 139 DstOverCompositeOp, 140 DifferenceCompositeOp, 141 DisplaceCompositeOp, 142 DissolveCompositeOp, 143 ExclusionCompositeOp, 144 HardLightCompositeOp, 145 HueCompositeOp, 146 InCompositeOp, 147 LightenCompositeOp, 148 LinearLightCompositeOp, 149 LuminizeCompositeOp, 150 MinusDstCompositeOp, 151 ModulateCompositeOp, 152 MultiplyCompositeOp, 153 OutCompositeOp, 154 OverCompositeOp, 155 OverlayCompositeOp, 156 PlusCompositeOp, 157 ReplaceCompositeOp, 158 SaturateCompositeOp, 159 ScreenCompositeOp, 160 SoftLightCompositeOp, 161 SrcAtopCompositeOp, 162 SrcCompositeOp, 163 SrcInCompositeOp, 164 SrcOutCompositeOp, 165 SrcOverCompositeOp, 166 ModulusSubtractCompositeOp, 167 ThresholdCompositeOp, 168 XorCompositeOp, 169 /* These are new operators, added after the above was last sorted. 170 * The list should be re-sorted only when a new library version is 171 * created. 172 */ 173 DivideDstCompositeOp, 174 DistortCompositeOp, 175 BlurCompositeOp, 176 PegtopLightCompositeOp, 177 VividLightCompositeOp, 178 PinLightCompositeOp, 179 LinearDodgeCompositeOp, 180 LinearBurnCompositeOp, 181 MathematicsCompositeOp, 182 DivideSrcCompositeOp, 183 MinusSrcCompositeOp, 184 DarkenIntensityCompositeOp, 185 LightenIntensityCompositeOp 186 } CompositeOperator; 187 ) 188 189 STRINGIFY( 190 typedef enum 191 { 192 UndefinedFunction, 193 ArcsinFunction, 194 ArctanFunction, 195 PolynomialFunction, 196 SinusoidFunction 197 } MagickFunction; 198 ) 199 200 STRINGIFY( 201 typedef enum 202 { 203 UndefinedNoise, 204 UniformNoise, 205 GaussianNoise, 206 MultiplicativeGaussianNoise, 207 ImpulseNoise, 208 LaplacianNoise, 209 PoissonNoise, 210 RandomNoise 211 } NoiseType; 212 ) 213 214 STRINGIFY( 215 typedef enum 216 { 217 UndefinedPixelIntensityMethod = 0, 218 AveragePixelIntensityMethod, 219 BrightnessPixelIntensityMethod, 220 LightnessPixelIntensityMethod, 221 Rec601LumaPixelIntensityMethod, 222 Rec601LuminancePixelIntensityMethod, 223 Rec709LumaPixelIntensityMethod, 224 Rec709LuminancePixelIntensityMethod, 225 RMSPixelIntensityMethod, 226 MSPixelIntensityMethod 227 } PixelIntensityMethod; 228 ) 229 230 STRINGIFY( 231 typedef enum { 232 BoxWeightingFunction = 0, 233 TriangleWeightingFunction, 234 CubicBCWeightingFunction, 235 HannWeightingFunction, 236 HammingWeightingFunction, 237 BlackmanWeightingFunction, 238 GaussianWeightingFunction, 239 QuadraticWeightingFunction, 240 JincWeightingFunction, 241 SincWeightingFunction, 242 SincFastWeightingFunction, 243 KaiserWeightingFunction, 244 WelshWeightingFunction, 245 BohmanWeightingFunction, 246 LagrangeWeightingFunction, 247 CosineWeightingFunction, 248 } ResizeWeightingFunctionType; 249 ) 250 251 STRINGIFY( 252 typedef enum 253 { 254 UndefinedChannel = 0x0000, 255 RedChannel = 0x0001, 256 GrayChannel = 0x0001, 257 CyanChannel = 0x0001, 258 GreenChannel = 0x0002, 259 MagentaChannel = 0x0002, 260 BlueChannel = 0x0004, 261 YellowChannel = 0x0004, 262 BlackChannel = 0x0008, 263 AlphaChannel = 0x0010, 264 OpacityChannel = 0x0010, 265 IndexChannel = 0x0020, /* Color Index Table? */ 266 ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */ 267 WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */ 268 MetaChannel = 0x0100, /* ???? */ 269 CompositeChannels = 0x001F, 270 AllChannels = 0x7ffffff, 271 /* 272 Special purpose channel types. 273 FUTURE: are these needed any more - they are more like hacks 274 SyncChannels for example is NOT a real channel but a 'flag' 275 It really says -- "User has not defined channels" 276 Though it does have extra meaning in the "-auto-level" operator 277 */ 278 TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */ 279 RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */ 280 GrayChannels = 0x0400, 281 SyncChannels = 0x20000, /* channels modified as a single unit */ 282 DefaultChannels = AllChannels 283 } ChannelType; /* must correspond to PixelChannel */ 284 ) 285 286 /* 287 Helper functions. 288 */ 289 290 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8)) 291 292 STRINGIFY( 293 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 294 { 295 return((CLQuantum) value); 296 } 297 ) 298 299 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16)) 300 301 STRINGIFY( 302 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 303 { 304 return((CLQuantum) (257.0f*value)); 305 } 306 ) 307 308 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32)) 309 310 STRINGIFY( 311 inline CLQuantum ScaleCharToQuantum(const unsigned char value) 312 { 313 return((CLQuantum) (16843009.0*value)); 314 } 315 ) 316 317 OPENCL_ENDIF() 318 319 STRINGIFY( 320 inline int ClampToCanvas(const int offset,const int range) 321 { 322 return clamp(offset, (int)0, range-1); 323 } 324 ) 325 326 STRINGIFY( 327 inline CLQuantum ClampToQuantum(const float value) 328 { 329 return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f); 330 } 331 ) 332 333 STRINGIFY( 334 inline uint ScaleQuantumToMap(CLQuantum value) 335 { 336 if (value >= (CLQuantum) MaxMap) 337 return ((uint)MaxMap); 338 else 339 return ((uint)value); 340 } 341 ) 342 343 STRINGIFY( 344 inline float PerceptibleReciprocal(const float x) 345 { 346 float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0; 347 return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon)); 348 } 349 ) 350 351 STRINGIFY( 352 inline float RoundToUnity(const float value) 353 { 354 return clamp(value,0.0f,1.0f); 355 } 356 ) 357 358 STRINGIFY( 359 360 inline unsigned int getPixelIndex(const unsigned int number_channels, 361 const unsigned int columns, const unsigned int x, const unsigned int y) 362 { 363 return (x * number_channels) + (y * columns * number_channels); 364 } 365 366 inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; } 367 inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); } 368 inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); } 369 inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); } 370 371 inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; } 372 inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; } 373 inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; } 374 inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; } 375 376 inline CLQuantum getBlue(CLPixelType p) { return p.x; } 377 inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; } 378 inline float getBlueF4(float4 p) { return p.x; } 379 inline void setBlueF4(float4* p, float value) { (*p).x = value; } 380 381 inline CLQuantum getGreen(CLPixelType p) { return p.y; } 382 inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; } 383 inline float getGreenF4(float4 p) { return p.y; } 384 inline void setGreenF4(float4* p, float value) { (*p).y = value; } 385 386 inline CLQuantum getRed(CLPixelType p) { return p.z; } 387 inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; } 388 inline float getRedF4(float4 p) { return p.z; } 389 inline void setRedF4(float4* p, float value) { (*p).z = value; } 390 391 inline CLQuantum getAlpha(CLPixelType p) { return p.w; } 392 inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; } 393 inline float getAlphaF4(float4 p) { return p.w; } 394 inline void setAlphaF4(float4* p, float value) { (*p).w = value; } 395 396 inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels, 397 const ChannelType channel, float *red, float *green, float *blue, float *alpha) 398 { 399 if ((channel & RedChannel) != 0) 400 *red=getPixelRed(p); 401 402 if (number_channels > 2) 403 { 404 if ((channel & GreenChannel) != 0) 405 *green=getPixelGreen(p); 406 407 if ((channel & BlueChannel) != 0) 408 *blue=getPixelBlue(p); 409 } 410 411 if (((number_channels == 4) || (number_channels == 2)) && 412 ((channel & AlphaChannel) != 0)) 413 *alpha=getPixelAlpha(p,number_channels); 414 } 415 416 inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels, 417 const unsigned int columns, const unsigned int x, const unsigned int y) 418 { 419 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 420 421 float4 pixel; 422 423 pixel.x=getPixelRed(p); 424 425 if (number_channels > 2) 426 { 427 pixel.y=getPixelGreen(p); 428 pixel.z=getPixelBlue(p); 429 } 430 431 if ((number_channels == 4) || (number_channels == 2)) 432 pixel.w=getPixelAlpha(p,number_channels); 433 return(pixel); 434 } 435 436 inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels, 437 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel) 438 { 439 const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 440 441 float red = 0.0f; 442 float green = 0.0f; 443 float blue = 0.0f; 444 float alpha = 0.0f; 445 446 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 447 return (float4)(red, green, blue, alpha); 448 } 449 450 inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels, 451 const ChannelType channel, float red, float green, float blue, float alpha) 452 { 453 if ((channel & RedChannel) != 0) 454 setPixelRed(p,ClampToQuantum(red)); 455 456 if (number_channels > 2) 457 { 458 if ((channel & GreenChannel) != 0) 459 setPixelGreen(p,ClampToQuantum(green)); 460 461 if ((channel & BlueChannel) != 0) 462 setPixelBlue(p,ClampToQuantum(blue)); 463 } 464 465 if (((number_channels == 4) || (number_channels == 2)) && 466 ((channel & AlphaChannel) != 0)) 467 setPixelAlpha(p,number_channels,ClampToQuantum(alpha)); 468 } 469 470 inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels, 471 const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel) 472 { 473 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 474 475 setPixelRed(p,ClampToQuantum(pixel.x)); 476 477 if (number_channels > 2) 478 { 479 setPixelGreen(p,ClampToQuantum(pixel.y)); 480 setPixelBlue(p,ClampToQuantum(pixel.z)); 481 } 482 483 if ((number_channels == 4) || (number_channels == 2)) 484 setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w)); 485 } 486 487 inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels, 488 const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel, 489 float4 pixel) 490 { 491 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 492 WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w); 493 } 494 495 inline float GetPixelIntensity(const unsigned int colorspace, 496 const unsigned int method,float red,float green,float blue) 497 { 498 float intensity; 499 500 if (colorspace == GRAYColorspace) 501 return red; 502 503 switch (method) 504 { 505 case AveragePixelIntensityMethod: 506 { 507 intensity=(red+green+blue)/3.0; 508 break; 509 } 510 case BrightnessPixelIntensityMethod: 511 { 512 intensity=MagickMax(MagickMax(red,green),blue); 513 break; 514 } 515 case LightnessPixelIntensityMethod: 516 { 517 intensity=(MagickMin(MagickMin(red,green),blue)+ 518 MagickMax(MagickMax(red,green),blue))/2.0; 519 break; 520 } 521 case MSPixelIntensityMethod: 522 { 523 intensity=(float) (((float) red*red+green*green+blue*blue)/ 524 (3.0*QuantumRange)); 525 break; 526 } 527 case Rec601LumaPixelIntensityMethod: 528 { 529 /* 530 if (image->colorspace == RGBColorspace) 531 { 532 red=EncodePixelGamma(red); 533 green=EncodePixelGamma(green); 534 blue=EncodePixelGamma(blue); 535 } 536 */ 537 intensity=0.298839*red+0.586811*green+0.114350*blue; 538 break; 539 } 540 case Rec601LuminancePixelIntensityMethod: 541 { 542 /* 543 if (image->colorspace == sRGBColorspace) 544 { 545 red=DecodePixelGamma(red); 546 green=DecodePixelGamma(green); 547 blue=DecodePixelGamma(blue); 548 } 549 */ 550 intensity=0.298839*red+0.586811*green+0.114350*blue; 551 break; 552 } 553 case Rec709LumaPixelIntensityMethod: 554 default: 555 { 556 /* 557 if (image->colorspace == RGBColorspace) 558 { 559 red=EncodePixelGamma(red); 560 green=EncodePixelGamma(green); 561 blue=EncodePixelGamma(blue); 562 } 563 */ 564 intensity=0.212656*red+0.715158*green+0.072186*blue; 565 break; 566 } 567 case Rec709LuminancePixelIntensityMethod: 568 { 569 /* 570 if (image->colorspace == sRGBColorspace) 571 { 572 red=DecodePixelGamma(red); 573 green=DecodePixelGamma(green); 574 blue=DecodePixelGamma(blue); 575 } 576 */ 577 intensity=0.212656*red+0.715158*green+0.072186*blue; 578 break; 579 } 580 case RMSPixelIntensityMethod: 581 { 582 intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/ 583 sqrt(3.0)); 584 break; 585 } 586 } 587 588 return intensity; 589 } 590 591 inline int mirrorBottom(int value) 592 { 593 return (value < 0) ? - (value) : value; 594 } 595 596 inline int mirrorTop(int value, int width) 597 { 598 return (value >= width) ? (2 * width - value - 1) : value; 599 } 600 ) 601 602 /* 603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 604 % % 605 % % 606 % % 607 % A d d N o i s e % 608 % % 609 % % 610 % % 611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 612 */ 613 614 STRINGIFY( 615 /* 616 Part of MWC64X by David Thomas, dt10@imperial.ac.uk 617 This is provided under BSD, full license is with the main package. 618 See http://www.doc.ic.ac.uk/~dt10/research 619 */ 620 621 // Pre: a<M, b<M 622 // Post: r=(a+b) mod M 623 ulong MWC_AddMod64(ulong a, ulong b, ulong M) 624 { 625 ulong v=a+b; 626 //if( (v>=M) || (v<a) ) 627 if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug. 628 v=v-M; 629 return v; 630 } 631 632 // Pre: a<M,b<M 633 // Post: r=(a*b) mod M 634 // This could be done more efficently, but it is portable, and should 635 // be easy to understand. It can be replaced with any of the better 636 // modular multiplication algorithms (for example if you know you have 637 // double precision available or something). 638 ulong MWC_MulMod64(ulong a, ulong b, ulong M) 639 { 640 ulong r=0; 641 while(a!=0){ 642 if(a&1) 643 r=MWC_AddMod64(r,b,M); 644 b=MWC_AddMod64(b,b,M); 645 a=a>>1; 646 } 647 return r; 648 } 649 650 // Pre: a<M, e>=0 651 // Post: r=(a^b) mod M 652 // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on 653 // most architectures 654 ulong MWC_PowMod64(ulong a, ulong e, ulong M) 655 { 656 ulong sqr=a, acc=1; 657 while(e!=0){ 658 if(e&1) 659 acc=MWC_MulMod64(acc,sqr,M); 660 sqr=MWC_MulMod64(sqr,sqr,M); 661 e=e>>1; 662 } 663 return acc; 664 } 665 666 uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance) 667 { 668 ulong m=MWC_PowMod64(A, distance, M); 669 ulong x=curr.x*(ulong)A+curr.y; 670 x=MWC_MulMod64(x, m, M); 671 return (uint2)((uint)(x/A), (uint)(x%A)); 672 } 673 674 uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap) 675 { 676 // This is an arbitrary constant for starting LCG jumping from. I didn't 677 // want to start from 1, as then you end up with the two or three first values 678 // being a bit poor in ones - once you've decided that, one constant is as 679 // good as any another. There is no deep mathematical reason for it, I just 680 // generated a random number. 681 enum{ MWC_BASEID = 4077358422479273989UL }; 682 683 ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap; 684 ulong m=MWC_PowMod64(A, dist, M); 685 686 ulong x=MWC_MulMod64(MWC_BASEID, m, M); 687 return (uint2)((uint)(x/A), (uint)(x%A)); 688 } 689 690 //! Represents the state of a particular generator 691 typedef struct{ uint x; uint c; } mwc64x_state_t; 692 693 enum{ MWC64X_A = 4294883355U }; 694 enum{ MWC64X_M = 18446383549859758079UL }; 695 696 void MWC64X_Step(mwc64x_state_t *s) 697 { 698 uint X=s->x, C=s->c; 699 700 uint Xn=MWC64X_A*X+C; 701 uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar 702 uint Cn=mad_hi(MWC64X_A,X,carry); 703 704 s->x=Xn; 705 s->c=Cn; 706 } 707 708 void MWC64X_Skip(mwc64x_state_t *s, ulong distance) 709 { 710 uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance); 711 s->x=tmp.x; 712 s->c=tmp.y; 713 } 714 715 void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset) 716 { 717 uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset); 718 s->x=tmp.x; 719 s->c=tmp.y; 720 } 721 722 //! Return a 32-bit integer in the range [0..2^32) 723 uint MWC64X_NextUint(mwc64x_state_t *s) 724 { 725 uint res=s->x ^ s->c; 726 MWC64X_Step(s); 727 return res; 728 } 729 730 // 731 // End of MWC64X excerpt 732 // 733 734 float mwcReadPseudoRandomValue(mwc64x_state_t* rng) 735 { 736 return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0 737 } 738 739 float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate) 740 { 741 float 742 alpha, 743 beta, 744 noise, 745 sigma; 746 747 noise = 0.0f; 748 alpha=mwcReadPseudoRandomValue(r); 749 switch(noise_type) 750 { 751 case UniformNoise: 752 default: 753 { 754 noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f)); 755 break; 756 } 757 case GaussianNoise: 758 { 759 float 760 gamma, 761 tau; 762 763 if (alpha == 0.0f) 764 alpha=1.0f; 765 beta=mwcReadPseudoRandomValue(r); 766 gamma=sqrt(-2.0f*log(alpha)); 767 sigma=gamma*cospi((2.0f*beta)); 768 tau=gamma*sinpi((2.0f*beta)); 769 noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau; 770 break; 771 } 772 case ImpulseNoise: 773 { 774 if (alpha < (SigmaImpulse/2.0f)) 775 noise=0.0f; 776 else 777 if (alpha >= (1.0f-(SigmaImpulse/2.0f))) 778 noise=QuantumRange; 779 else 780 noise=pixel; 781 break; 782 } 783 case LaplacianNoise: 784 { 785 if (alpha <= 0.5f) 786 { 787 if (alpha <= MagickEpsilon) 788 noise=(pixel-QuantumRange); 789 else 790 noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+ 791 0.5f); 792 break; 793 } 794 beta=1.0f-alpha; 795 if (beta <= (0.5f*MagickEpsilon)) 796 noise=(pixel+QuantumRange); 797 else 798 noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f); 799 break; 800 } 801 case MultiplicativeGaussianNoise: 802 { 803 sigma=1.0f; 804 if (alpha > MagickEpsilon) 805 sigma=sqrt(-2.0f*log(alpha)); 806 beta=mwcReadPseudoRandomValue(r); 807 noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma* 808 cospi((2.0f*beta))/2.0f); 809 break; 810 } 811 case PoissonNoise: 812 { 813 float 814 poisson; 815 unsigned int i; 816 poisson=exp(-SigmaPoisson*QuantumScale*pixel); 817 for (i=0; alpha > poisson; i++) 818 { 819 beta=mwcReadPseudoRandomValue(r); 820 alpha*=beta; 821 } 822 noise=(QuantumRange*i/SigmaPoisson); 823 break; 824 } 825 case RandomNoise: 826 { 827 noise=(QuantumRange*SigmaRandom*alpha); 828 break; 829 } 830 } 831 return noise; 832 } 833 834 __kernel 835 void AddNoise(const __global CLQuantum *image, 836 const unsigned int number_channels,const ChannelType channel, 837 const unsigned int length,const unsigned int pixelsPerWorkItem, 838 const NoiseType noise_type,const float attenuate,const unsigned int seed0, 839 const unsigned int seed1,const unsigned int numRandomNumbersPerPixel, 840 __global CLQuantum *filteredImage) 841 { 842 mwc64x_state_t rng; 843 rng.x = seed0; 844 rng.c = seed1; 845 846 uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use 847 uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream); 848 MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams 849 850 uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels); 851 uint count = pixelsPerWorkItem; 852 853 while (count > 0 && pos < length) 854 { 855 const __global CLQuantum *p = image + pos; 856 __global CLQuantum *q = filteredImage + pos; 857 858 float red; 859 float green; 860 float blue; 861 float alpha; 862 863 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 864 865 if ((channel & RedChannel) != 0) 866 red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate); 867 868 if (number_channels > 2) 869 { 870 if ((channel & GreenChannel) != 0) 871 green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate); 872 873 if ((channel & BlueChannel) != 0) 874 blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate); 875 } 876 877 if (((number_channels == 4) || (number_channels == 2)) && 878 ((channel & AlphaChannel) != 0)) 879 alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate); 880 881 WriteChannels(q, number_channels, channel, red, green, blue, alpha); 882 883 pos += (get_local_size(0) * number_channels); 884 count--; 885 } 886 } 887 ) 888 889 /* 890 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 891 % % 892 % % 893 % % 894 % B l u r % 895 % % 896 % % 897 % % 898 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 899 */ 900 901 STRINGIFY( 902 /* 903 Reduce image noise and reduce detail levels by row 904 */ 905 __kernel void BlurRow(const __global CLQuantum *image, 906 const unsigned int number_channels,const ChannelType channel, 907 __constant float *filter,const unsigned int width, 908 const unsigned int imageColumns,const unsigned int imageRows, 909 __local float4 *temp,__global float4 *tempImage) 910 { 911 const int x = get_global_id(0); 912 const int y = get_global_id(1); 913 914 const int columns = imageColumns; 915 916 const unsigned int radius = (width-1)/2; 917 const int wsize = get_local_size(0); 918 const unsigned int loadSize = wsize+width; 919 920 //group coordinate 921 const int groupX=get_local_size(0)*get_group_id(0); 922 923 //parallel load and clamp 924 for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0)) 925 { 926 int cx = ClampToCanvas(i + groupX - radius, columns); 927 temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel); 928 } 929 930 // barrier 931 barrier(CLK_LOCAL_MEM_FENCE); 932 933 // only do the work if this is not a patched item 934 if (get_global_id(0) < columns) 935 { 936 // compute 937 float4 result = (float4) 0; 938 939 int i = 0; 940 941 for ( ; i+7 < width; ) 942 { 943 for (int j=0; j < 8; j++) 944 result+=filter[i+j]*temp[i+j+get_local_id(0)]; 945 i+=8; 946 } 947 948 for ( ; i < width; i++) 949 result+=filter[i]*temp[i+get_local_id(0)]; 950 951 // write back to global 952 tempImage[y*columns+x] = result; 953 } 954 } 955 ) 956 957 STRINGIFY( 958 /* 959 Reduce image noise and reduce detail levels by line 960 */ 961 __kernel void BlurColumn(const __global float4 *blurRowData, 962 const unsigned int number_channels,const ChannelType channel, 963 __constant float *filter,const unsigned int width, 964 const unsigned int imageColumns,const unsigned int imageRows, 965 __local float4 *temp,__global CLQuantum *filteredImage) 966 { 967 const int x = get_global_id(0); 968 const int y = get_global_id(1); 969 970 const int columns = imageColumns; 971 const int rows = imageRows; 972 973 unsigned int radius = (width-1)/2; 974 const int wsize = get_local_size(1); 975 const unsigned int loadSize = wsize+width; 976 977 //group coordinate 978 const int groupX=get_local_size(0)*get_group_id(0); 979 const int groupY=get_local_size(1)*get_group_id(1); 980 //notice that get_local_size(0) is 1, so 981 //groupX=get_group_id(0); 982 983 //parallel load and clamp 984 for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1)) 985 temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX]; 986 987 // barrier 988 barrier(CLK_LOCAL_MEM_FENCE); 989 990 // only do the work if this is not a patched item 991 if (get_global_id(1) < rows) 992 { 993 // compute 994 float4 result = (float4) 0; 995 996 int i = 0; 997 998 for ( ; i+7 < width; ) 999 { 1000 for (int j=0; j < 8; j++) 1001 result+=filter[i+j]*temp[i+j+get_local_id(1)]; 1002 i+=8; 1003 } 1004 1005 for ( ; i < width; i++) 1006 result+=filter[i]*temp[i+get_local_id(1)]; 1007 1008 // write back to global 1009 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 1010 } 1011 } 1012 ) 1013 1014 /* 1015 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1016 % % 1017 % % 1018 % % 1019 % C o n t r a s t % 1020 % % 1021 % % 1022 % % 1023 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1024 */ 1025 1026 STRINGIFY( 1027 1028 inline float3 ConvertRGBToHSB(CLPixelType pixel) { 1029 float3 HueSaturationBrightness; 1030 HueSaturationBrightness.x = 0.0f; // Hue 1031 HueSaturationBrightness.y = 0.0f; // Saturation 1032 HueSaturationBrightness.z = 0.0f; // Brightness 1033 1034 float r=(float) getRed(pixel); 1035 float g=(float) getGreen(pixel); 1036 float b=(float) getBlue(pixel); 1037 1038 float tmin=MagickMin(MagickMin(r,g),b); 1039 float tmax=MagickMax(MagickMax(r,g),b); 1040 1041 if (tmax!=0.0f) { 1042 float delta=tmax-tmin; 1043 HueSaturationBrightness.y=delta/tmax; 1044 HueSaturationBrightness.z=QuantumScale*tmax; 1045 1046 if (delta != 0.0f) { 1047 HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f)); 1048 HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta; 1049 HueSaturationBrightness.x/=6.0f; 1050 HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f; 1051 } 1052 } 1053 return HueSaturationBrightness; 1054 } 1055 1056 inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) { 1057 1058 float hue = HueSaturationBrightness.x; 1059 float brightness = HueSaturationBrightness.z; 1060 float saturation = HueSaturationBrightness.y; 1061 1062 CLPixelType rgb; 1063 1064 if (saturation == 0.0f) { 1065 setRed(&rgb,ClampToQuantum(QuantumRange*brightness)); 1066 setGreen(&rgb,getRed(rgb)); 1067 setBlue(&rgb,getRed(rgb)); 1068 } 1069 else { 1070 1071 float h=6.0f*(hue-floor(hue)); 1072 float f=h-floor(h); 1073 float p=brightness*(1.0f-saturation); 1074 float q=brightness*(1.0f-saturation*f); 1075 float t=brightness*(1.0f-(saturation*(1.0f-f))); 1076 1077 float clampedBrightness = ClampToQuantum(QuantumRange*brightness); 1078 float clamped_t = ClampToQuantum(QuantumRange*t); 1079 float clamped_p = ClampToQuantum(QuantumRange*p); 1080 float clamped_q = ClampToQuantum(QuantumRange*q); 1081 int ih = (int)h; 1082 setRed(&rgb, (ih == 1)?clamped_q: 1083 (ih == 2 || ih == 3)?clamped_p: 1084 (ih == 4)?clamped_t: 1085 clampedBrightness); 1086 1087 setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness: 1088 (ih == 3)?clamped_q: 1089 (ih == 4 || ih == 5)?clamped_p: 1090 clamped_t); 1091 1092 setBlue(&rgb, (ih == 2)?clamped_t: 1093 (ih == 3 || ih == 4)?clampedBrightness: 1094 (ih == 5)?clamped_q: 1095 clamped_p); 1096 } 1097 return rgb; 1098 } 1099 1100 __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen) 1101 { 1102 1103 const int sign = sharpen!=0?1:-1; 1104 const int x = get_global_id(0); 1105 const int y = get_global_id(1); 1106 const int columns = get_global_size(0); 1107 const int c = x + y * columns; 1108 1109 CLPixelType pixel = im[c]; 1110 float3 HueSaturationBrightness = ConvertRGBToHSB(pixel); 1111 float brightness = HueSaturationBrightness.z; 1112 brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness); 1113 brightness = clamp(brightness,0.0f,1.0f); 1114 HueSaturationBrightness.z = brightness; 1115 1116 CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness); 1117 filteredPixel.w = pixel.w; 1118 im[c] = filteredPixel; 1119 } 1120 ) 1121 1122 /* 1123 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1124 % % 1125 % % 1126 % % 1127 % C o n t r a s t S t r e t c h % 1128 % % 1129 % % 1130 % % 1131 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1132 */ 1133 1134 STRINGIFY( 1135 /* 1136 */ 1137 __kernel void Histogram(__global CLPixelType * restrict im, 1138 const ChannelType channel, 1139 const unsigned int colorspace, 1140 const unsigned int method, 1141 __global uint4 * restrict histogram) 1142 { 1143 const int x = get_global_id(0); 1144 const int y = get_global_id(1); 1145 const int columns = get_global_size(0); 1146 const int c = x + y * columns; 1147 if ((channel & SyncChannels) != 0) 1148 { 1149 float red=(float)getRed(im[c]); 1150 float green=(float)getGreen(im[c]); 1151 float blue=(float)getBlue(im[c]); 1152 1153 float intensity = GetPixelIntensity(colorspace, method, red, green, blue); 1154 uint pos = ScaleQuantumToMap(ClampToQuantum(intensity)); 1155 atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position 1156 } 1157 else 1158 { 1159 // for equalizing, we always need all channels? 1160 // otherwise something more 1161 } 1162 } 1163 ) 1164 1165 STRINGIFY( 1166 /* 1167 */ 1168 __kernel void ContrastStretch(__global CLPixelType * restrict im, 1169 const ChannelType channel, 1170 __global CLPixelType * restrict stretch_map, 1171 const float4 white, const float4 black) 1172 { 1173 const int x = get_global_id(0); 1174 const int y = get_global_id(1); 1175 const int columns = get_global_size(0); 1176 const int c = x + y * columns; 1177 1178 uint ePos; 1179 CLPixelType oValue, eValue; 1180 CLQuantum red, green, blue, alpha; 1181 1182 //read from global 1183 oValue=im[c]; 1184 1185 if ((channel & RedChannel) != 0) 1186 { 1187 if (getRedF4(white) != getRedF4(black)) 1188 { 1189 ePos = ScaleQuantumToMap(getRed(oValue)); 1190 eValue = stretch_map[ePos]; 1191 red = getRed(eValue); 1192 } 1193 } 1194 1195 if ((channel & GreenChannel) != 0) 1196 { 1197 if (getGreenF4(white) != getGreenF4(black)) 1198 { 1199 ePos = ScaleQuantumToMap(getGreen(oValue)); 1200 eValue = stretch_map[ePos]; 1201 green = getGreen(eValue); 1202 } 1203 } 1204 1205 if ((channel & BlueChannel) != 0) 1206 { 1207 if (getBlueF4(white) != getBlueF4(black)) 1208 { 1209 ePos = ScaleQuantumToMap(getBlue(oValue)); 1210 eValue = stretch_map[ePos]; 1211 blue = getBlue(eValue); 1212 } 1213 } 1214 1215 if ((channel & AlphaChannel) != 0) 1216 { 1217 if (getAlphaF4(white) != getAlphaF4(black)) 1218 { 1219 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1220 eValue = stretch_map[ePos]; 1221 alpha = getAlpha(eValue); 1222 } 1223 } 1224 1225 //write back 1226 im[c]=(CLPixelType)(blue, green, red, alpha); 1227 1228 } 1229 ) 1230 1231 /* 1232 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1233 % % 1234 % % 1235 % % 1236 % C o n v o l v e % 1237 % % 1238 % % 1239 % % 1240 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1241 */ 1242 1243 STRINGIFY( 1244 __kernel 1245 void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output, 1246 const unsigned int imageWidth, const unsigned int imageHeight, 1247 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1248 const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) { 1249 1250 int2 blockID; 1251 blockID.x = get_global_id(0) / get_local_size(0); 1252 blockID.y = get_global_id(1) / get_local_size(1); 1253 1254 // image area processed by this workgroup 1255 int2 imageAreaOrg; 1256 imageAreaOrg.x = blockID.x * get_local_size(0); 1257 imageAreaOrg.y = blockID.y * get_local_size(1); 1258 1259 int2 midFilterDimen; 1260 midFilterDimen.x = (filterWidth-1)/2; 1261 midFilterDimen.y = (filterHeight-1)/2; 1262 1263 int2 cachedAreaOrg = imageAreaOrg - midFilterDimen; 1264 1265 // dimension of the local cache 1266 int2 cachedAreaDimen; 1267 cachedAreaDimen.x = get_local_size(0) + filterWidth - 1; 1268 cachedAreaDimen.y = get_local_size(1) + filterHeight - 1; 1269 1270 // cache the pixels accessed by this workgroup in local memory 1271 int localID = get_local_id(1)*get_local_size(0)+get_local_id(0); 1272 int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y; 1273 int groupSize = get_local_size(0) * get_local_size(1); 1274 for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) { 1275 1276 int2 cachedAreaIndex; 1277 cachedAreaIndex.x = i % cachedAreaDimen.x; 1278 cachedAreaIndex.y = i / cachedAreaDimen.x; 1279 1280 int2 imagePixelIndex; 1281 imagePixelIndex = cachedAreaOrg + cachedAreaIndex; 1282 1283 // only support EdgeVirtualPixelMethod through ClampToCanvas 1284 // TODO: implement other virtual pixel method 1285 imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth); 1286 imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight); 1287 1288 pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x]; 1289 } 1290 1291 // cache the filter 1292 for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) { 1293 filterCache[i] = filter[i]; 1294 } 1295 barrier(CLK_LOCAL_MEM_FENCE); 1296 1297 1298 int2 imageIndex; 1299 imageIndex.x = imageAreaOrg.x + get_local_id(0); 1300 imageIndex.y = imageAreaOrg.y + get_local_id(1); 1301 1302 // if out-of-range, stops here and quit 1303 if (imageIndex.x >= imageWidth 1304 || imageIndex.y >= imageHeight) { 1305 return; 1306 } 1307 1308 int filterIndex = 0; 1309 float4 sum = (float4)0.0f; 1310 float gamma = 0.0f; 1311 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1312 int cacheIndexY = get_local_id(1); 1313 for (int j = 0; j < filterHeight; j++) { 1314 int cacheIndexX = get_local_id(0); 1315 for (int i = 0; i < filterWidth; i++) { 1316 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1317 float f = filterCache[filterIndex]; 1318 1319 sum.x += f * p.x; 1320 sum.y += f * p.y; 1321 sum.z += f * p.z; 1322 sum.w += f * p.w; 1323 1324 gamma += f; 1325 filterIndex++; 1326 cacheIndexX++; 1327 } 1328 cacheIndexY++; 1329 } 1330 } 1331 else { 1332 int cacheIndexY = get_local_id(1); 1333 for (int j = 0; j < filterHeight; j++) { 1334 int cacheIndexX = get_local_id(0); 1335 for (int i = 0; i < filterWidth; i++) { 1336 1337 CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX]; 1338 float alpha = QuantumScale*p.w; 1339 float f = filterCache[filterIndex]; 1340 float g = alpha * f; 1341 1342 sum.x += g*p.x; 1343 sum.y += g*p.y; 1344 sum.z += g*p.z; 1345 sum.w += f*p.w; 1346 1347 gamma += g; 1348 filterIndex++; 1349 cacheIndexX++; 1350 } 1351 cacheIndexY++; 1352 } 1353 gamma = PerceptibleReciprocal(gamma); 1354 sum.xyz = gamma*sum.xyz; 1355 } 1356 CLPixelType outputPixel; 1357 outputPixel.x = ClampToQuantum(sum.x); 1358 outputPixel.y = ClampToQuantum(sum.y); 1359 outputPixel.z = ClampToQuantum(sum.z); 1360 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1361 1362 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1363 } 1364 ) 1365 1366 STRINGIFY( 1367 __kernel 1368 void Convolve(const __global CLPixelType *input, __global CLPixelType *output, 1369 const uint imageWidth, const uint imageHeight, 1370 __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight, 1371 const uint matte, const ChannelType channel) { 1372 1373 int2 imageIndex; 1374 imageIndex.x = get_global_id(0); 1375 imageIndex.y = get_global_id(1); 1376 1377 /* 1378 unsigned int imageWidth = get_global_size(0); 1379 unsigned int imageHeight = get_global_size(1); 1380 */ 1381 if (imageIndex.x >= imageWidth 1382 || imageIndex.y >= imageHeight) 1383 return; 1384 1385 int2 midFilterDimen; 1386 midFilterDimen.x = (filterWidth-1)/2; 1387 midFilterDimen.y = (filterHeight-1)/2; 1388 1389 int filterIndex = 0; 1390 float4 sum = (float4)0.0f; 1391 float gamma = 0.0f; 1392 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 1393 for (int j = 0; j < filterHeight; j++) { 1394 int2 inputPixelIndex; 1395 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1396 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1397 for (int i = 0; i < filterWidth; i++) { 1398 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1399 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1400 1401 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1402 float f = filter[filterIndex]; 1403 1404 sum.x += f * p.x; 1405 sum.y += f * p.y; 1406 sum.z += f * p.z; 1407 sum.w += f * p.w; 1408 1409 gamma += f; 1410 1411 filterIndex++; 1412 } 1413 } 1414 } 1415 else { 1416 1417 for (int j = 0; j < filterHeight; j++) { 1418 int2 inputPixelIndex; 1419 inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j; 1420 inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight); 1421 for (int i = 0; i < filterWidth; i++) { 1422 inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i; 1423 inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth); 1424 1425 CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x]; 1426 float alpha = QuantumScale*p.w; 1427 float f = filter[filterIndex]; 1428 float g = alpha * f; 1429 1430 sum.x += g*p.x; 1431 sum.y += g*p.y; 1432 sum.z += g*p.z; 1433 sum.w += f*p.w; 1434 1435 gamma += g; 1436 1437 1438 filterIndex++; 1439 } 1440 } 1441 gamma = PerceptibleReciprocal(gamma); 1442 sum.xyz = gamma*sum.xyz; 1443 } 1444 1445 CLPixelType outputPixel; 1446 outputPixel.x = ClampToQuantum(sum.x); 1447 outputPixel.y = ClampToQuantum(sum.y); 1448 outputPixel.z = ClampToQuantum(sum.z); 1449 outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w; 1450 1451 output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel; 1452 } 1453 ) 1454 1455 /* 1456 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1457 % % 1458 % % 1459 % % 1460 % D e s p e c k l e % 1461 % % 1462 % % 1463 % % 1464 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1465 */ 1466 1467 STRINGIFY( 1468 1469 __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1470 , const unsigned int imageWidth, const unsigned int imageHeight 1471 , const int2 offset, const int polarity, const int matte) { 1472 1473 int x = get_global_id(0); 1474 int y = get_global_id(1); 1475 1476 CLPixelType v = inputImage[y*imageWidth+x]; 1477 1478 int2 neighbor; 1479 neighbor.y = y + offset.y; 1480 neighbor.x = x + offset.x; 1481 1482 int2 clampedNeighbor; 1483 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1484 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1485 1486 CLPixelType r = (clampedNeighbor.x == neighbor.x 1487 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1488 :(CLPixelType)0; 1489 1490 int sv[4]; 1491 sv[0] = (int)v.x; 1492 sv[1] = (int)v.y; 1493 sv[2] = (int)v.z; 1494 sv[3] = (int)v.w; 1495 1496 int sr[4]; 1497 sr[0] = (int)r.x; 1498 sr[1] = (int)r.y; 1499 sr[2] = (int)r.z; 1500 sr[3] = (int)r.w; 1501 1502 if (polarity > 0) { 1503 \n #pragma unroll 4\n 1504 for (unsigned int i = 0; i < 4; i++) { 1505 sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i]; 1506 } 1507 } 1508 else { 1509 \n #pragma unroll 4\n 1510 for (unsigned int i = 0; i < 4; i++) { 1511 sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i]; 1512 } 1513 1514 } 1515 1516 v.x = (CLQuantum)sv[0]; 1517 v.y = (CLQuantum)sv[1]; 1518 v.z = (CLQuantum)sv[2]; 1519 1520 if (matte!=0) 1521 v.w = (CLQuantum)sv[3]; 1522 1523 outputImage[y*imageWidth+x] = v; 1524 1525 } 1526 1527 1528 ) 1529 1530 STRINGIFY( 1531 1532 __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage 1533 , const unsigned int imageWidth, const unsigned int imageHeight 1534 , const int2 offset, const int polarity, const int matte) { 1535 1536 int x = get_global_id(0); 1537 int y = get_global_id(1); 1538 1539 CLPixelType v = inputImage[y*imageWidth+x]; 1540 1541 int2 neighbor, clampedNeighbor; 1542 1543 neighbor.y = y + offset.y; 1544 neighbor.x = x + offset.x; 1545 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1546 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1547 1548 CLPixelType r = (clampedNeighbor.x == neighbor.x 1549 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1550 :(CLPixelType)0; 1551 1552 1553 neighbor.y = y - offset.y; 1554 neighbor.x = x - offset.x; 1555 clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth); 1556 clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight); 1557 1558 CLPixelType s = (clampedNeighbor.x == neighbor.x 1559 && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x] 1560 :(CLPixelType)0; 1561 1562 1563 int sv[4]; 1564 sv[0] = (int)v.x; 1565 sv[1] = (int)v.y; 1566 sv[2] = (int)v.z; 1567 sv[3] = (int)v.w; 1568 1569 int sr[4]; 1570 sr[0] = (int)r.x; 1571 sr[1] = (int)r.y; 1572 sr[2] = (int)r.z; 1573 sr[3] = (int)r.w; 1574 1575 int ss[4]; 1576 ss[0] = (int)s.x; 1577 ss[1] = (int)s.y; 1578 ss[2] = (int)s.z; 1579 ss[3] = (int)s.w; 1580 1581 if (polarity > 0) { 1582 \n #pragma unroll 4\n 1583 for (unsigned int i = 0; i < 4; i++) { 1584 //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i]; 1585 // 1586 //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1587 //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1588 sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1)); 1589 } 1590 } 1591 else { 1592 \n #pragma unroll 4\n 1593 for (unsigned int i = 0; i < 4; i++) { 1594 //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i]; 1595 // 1596 //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1597 sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1)); 1598 } 1599 } 1600 1601 v.x = (CLQuantum)sv[0]; 1602 v.y = (CLQuantum)sv[1]; 1603 v.z = (CLQuantum)sv[2]; 1604 1605 if (matte!=0) 1606 v.w = (CLQuantum)sv[3]; 1607 1608 outputImage[y*imageWidth+x] = v; 1609 1610 } 1611 ) 1612 1613 /* 1614 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1615 % % 1616 % % 1617 % % 1618 % E q u a l i z e % 1619 % % 1620 % % 1621 % % 1622 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1623 */ 1624 1625 STRINGIFY( 1626 /* 1627 */ 1628 __kernel void Equalize(__global CLPixelType * restrict im, 1629 const ChannelType channel, 1630 __global CLPixelType * restrict equalize_map, 1631 const float4 white, const float4 black) 1632 { 1633 const int x = get_global_id(0); 1634 const int y = get_global_id(1); 1635 const int columns = get_global_size(0); 1636 const int c = x + y * columns; 1637 1638 uint ePos; 1639 CLPixelType oValue, eValue; 1640 CLQuantum red, green, blue, alpha; 1641 1642 //read from global 1643 oValue=im[c]; 1644 1645 if ((channel & SyncChannels) != 0) 1646 { 1647 if (getRedF4(white) != getRedF4(black)) 1648 { 1649 ePos = ScaleQuantumToMap(getRed(oValue)); 1650 eValue = equalize_map[ePos]; 1651 red = getRed(eValue); 1652 ePos = ScaleQuantumToMap(getGreen(oValue)); 1653 eValue = equalize_map[ePos]; 1654 green = getRed(eValue); 1655 ePos = ScaleQuantumToMap(getBlue(oValue)); 1656 eValue = equalize_map[ePos]; 1657 blue = getRed(eValue); 1658 ePos = ScaleQuantumToMap(getAlpha(oValue)); 1659 eValue = equalize_map[ePos]; 1660 alpha = getRed(eValue); 1661 1662 //write back 1663 im[c]=(CLPixelType)(blue, green, red, alpha); 1664 } 1665 1666 } 1667 1668 // for equalizing, we always need all channels? 1669 // otherwise something more 1670 1671 } 1672 ) 1673 1674 /* 1675 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1676 % % 1677 % % 1678 % % 1679 % F u n c t i o n % 1680 % % 1681 % % 1682 % % 1683 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1684 */ 1685 1686 STRINGIFY( 1687 /* 1688 apply FunctionImageChannel(braightness-contrast) 1689 */ 1690 CLQuantum ApplyFunction(float pixel,const MagickFunction function, 1691 const unsigned int number_parameters,__constant float *parameters) 1692 { 1693 float result = 0.0f; 1694 1695 switch (function) 1696 { 1697 case PolynomialFunction: 1698 { 1699 for (unsigned int i=0; i < number_parameters; i++) 1700 result = result*QuantumScale*pixel + parameters[i]; 1701 result *= QuantumRange; 1702 break; 1703 } 1704 case SinusoidFunction: 1705 { 1706 float freq,phase,ampl,bias; 1707 freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1708 phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f; 1709 ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f; 1710 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1711 result = QuantumRange*(ampl*sin(2.0f*MagickPI* 1712 (freq*QuantumScale*pixel + phase/360.0f)) + bias); 1713 break; 1714 } 1715 case ArcsinFunction: 1716 { 1717 float width,range,center,bias; 1718 width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1719 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1720 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1721 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1722 1723 result = 2.0f/width*(QuantumScale*pixel - center); 1724 result = range/MagickPI*asin(result)+bias; 1725 result = ( result <= -1.0f ) ? bias - range/2.0f : result; 1726 result = ( result >= 1.0f ) ? bias + range/2.0f : result; 1727 result *= QuantumRange; 1728 break; 1729 } 1730 case ArctanFunction: 1731 { 1732 float slope,range,center,bias; 1733 slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f; 1734 center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f; 1735 range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f; 1736 bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f; 1737 result = MagickPI*slope*(QuantumScale*pixel-center); 1738 result = QuantumRange*(range/MagickPI*atan(result) + bias); 1739 break; 1740 } 1741 case UndefinedFunction: 1742 break; 1743 } 1744 return(ClampToQuantum(result)); 1745 } 1746 ) 1747 1748 STRINGIFY( 1749 /* 1750 Improve brightness / contrast of the image 1751 channel : define which channel is improved 1752 function : the function called to enchance the brightness contrast 1753 number_parameters : numbers of parameters 1754 parameters : the parameter 1755 */ 1756 __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels, 1757 const ChannelType channel,const MagickFunction function,const unsigned int number_parameters, 1758 __constant float *parameters) 1759 { 1760 const unsigned int x = get_global_id(0); 1761 const unsigned int y = get_global_id(1); 1762 const unsigned int columns = get_global_size(0); 1763 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1764 1765 float red; 1766 float green; 1767 float blue; 1768 float alpha; 1769 1770 ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha); 1771 1772 if ((channel & RedChannel) != 0) 1773 red=ApplyFunction(red, function, number_parameters, parameters); 1774 1775 if (number_channels > 2) 1776 { 1777 if ((channel & GreenChannel) != 0) 1778 green=ApplyFunction(green, function, number_parameters, parameters); 1779 1780 if ((channel & BlueChannel) != 0) 1781 blue=ApplyFunction(blue, function, number_parameters, parameters); 1782 } 1783 1784 if (((number_channels == 4) || (number_channels == 2)) && 1785 ((channel & AlphaChannel) != 0)) 1786 alpha=ApplyFunction(alpha, function, number_parameters, parameters); 1787 1788 WriteChannels(p, number_channels, channel, red, green, blue, alpha); 1789 } 1790 ) 1791 1792 /* 1793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1794 % % 1795 % % 1796 % % 1797 % G r a y s c a l e % 1798 % % 1799 % % 1800 % % 1801 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1802 */ 1803 1804 STRINGIFY( 1805 __kernel void Grayscale(__global CLQuantum *image,const int number_channels, 1806 const unsigned int colorspace,const unsigned int method) 1807 { 1808 const unsigned int x = get_global_id(0); 1809 const unsigned int y = get_global_id(1); 1810 const unsigned int columns = get_global_size(0); 1811 __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y); 1812 1813 float 1814 blue, 1815 green, 1816 red; 1817 1818 red=getPixelRed(p); 1819 green=getPixelGreen(p); 1820 blue=getPixelBlue(p); 1821 1822 CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue)); 1823 1824 setPixelRed(p,intensity); 1825 setPixelGreen(p,intensity); 1826 setPixelBlue(p,intensity); 1827 } 1828 ) 1829 1830 /* 1831 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1832 % % 1833 % % 1834 % % 1835 % L o c a l C o n t r a s t % 1836 % % 1837 % % 1838 % % 1839 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1840 */ 1841 1842 STRINGIFY( 1843 1844 __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage, 1845 const int radius, 1846 const int imageWidth, 1847 const int imageHeight) 1848 { 1849 const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f)); 1850 1851 int x = get_local_id(0); 1852 int y = get_global_id(1); 1853 1854 global CLPixelType *src = srcImage + y * imageWidth; 1855 1856 for (int i = x; i < imageWidth; i += get_local_size(0)) { 1857 float sum = 0.0f; 1858 float weight = 1.0f; 1859 1860 int j = i - radius; 1861 while ((j + 7) < i) { 1862 for (int k = 0; k < 8; ++k) // Unroll 8x 1863 sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)])); 1864 weight += 8.0f; 1865 j+=8; 1866 } 1867 while (j < i) { 1868 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)])); 1869 weight += 1.0f; 1870 ++j; 1871 } 1872 1873 while ((j + 7) < radius + i) { 1874 for (int k = 0; k < 8; ++k) // Unroll 8x 1875 sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)])); 1876 weight -= 8.0f; 1877 j+=8; 1878 } 1879 while (j < radius + i) { 1880 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)])); 1881 weight -= 1.0f; 1882 ++j; 1883 } 1884 1885 tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1)); 1886 } 1887 } 1888 ) 1889 1890 STRINGIFY( 1891 __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage, 1892 const int radius, 1893 const float strength, 1894 const int imageWidth, 1895 const int imageHeight) 1896 { 1897 const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f); 1898 1899 int x = get_global_id(0); 1900 int y = get_global_id(1); 1901 1902 if ((x >= imageWidth) || (y >= imageHeight)) 1903 return; 1904 1905 global float *src = blurImage + x; 1906 1907 float sum = 0.0f; 1908 float weight = 1.0f; 1909 1910 int j = y - radius; 1911 while ((j + 7) < y) { 1912 for (int k = 0; k < 8; ++k) // Unroll 8x 1913 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth]; 1914 weight += 8.0f; 1915 j+=8; 1916 } 1917 while (j < y) { 1918 sum += weight * src[mirrorBottom(j) * imageWidth]; 1919 weight += 1.0f; 1920 ++j; 1921 } 1922 1923 while ((j + 7) < radius + y) { 1924 for (int k = 0; k < 8; ++k) // Unroll 8x 1925 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth]; 1926 weight -= 8.0f; 1927 j+=8; 1928 } 1929 while (j < radius + y) { 1930 sum += weight * src[mirrorTop(j, imageHeight) * imageWidth]; 1931 weight -= 1.0f; 1932 ++j; 1933 } 1934 1935 CLPixelType pixel = srcImage[x + y * imageWidth]; 1936 float srcVal = dot(RGB, convert_float4(pixel)); 1937 float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f); 1938 mult = (srcVal + mult) / srcVal; 1939 1940 pixel.x = ClampToQuantum(pixel.x * mult); 1941 pixel.y = ClampToQuantum(pixel.y * mult); 1942 pixel.z = ClampToQuantum(pixel.z * mult); 1943 1944 dstImage[x + y * imageWidth] = pixel; 1945 } 1946 ) 1947 1948 /* 1949 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1950 % % 1951 % % 1952 % % 1953 % M o d u l a t e % 1954 % % 1955 % % 1956 % % 1957 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1958 */ 1959 1960 STRINGIFY( 1961 1962 inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue, 1963 float *hue, float *saturation, float *lightness) 1964 { 1965 float 1966 c, 1967 tmax, 1968 tmin; 1969 1970 /* 1971 Convert RGB to HSL colorspace. 1972 */ 1973 tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue)); 1974 tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue)); 1975 1976 c=tmax-tmin; 1977 1978 *lightness=(tmax+tmin)/2.0; 1979 if (c <= 0.0) 1980 { 1981 *hue=0.0; 1982 *saturation=0.0; 1983 return; 1984 } 1985 1986 if (tmax == (QuantumScale*red)) 1987 { 1988 *hue=(QuantumScale*green-QuantumScale*blue)/c; 1989 if ((QuantumScale*green) < (QuantumScale*blue)) 1990 *hue+=6.0; 1991 } 1992 else 1993 if (tmax == (QuantumScale*green)) 1994 *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c; 1995 else 1996 *hue=4.0+(QuantumScale*red-QuantumScale*green)/c; 1997 1998 *hue*=60.0/360.0; 1999 if (*lightness <= 0.5) 2000 *saturation=c/(2.0*(*lightness)); 2001 else 2002 *saturation=c/(2.0-2.0*(*lightness)); 2003 } 2004 2005 inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness, 2006 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2007 { 2008 float 2009 b, 2010 c, 2011 g, 2012 h, 2013 tmin, 2014 r, 2015 x; 2016 2017 /* 2018 Convert HSL to RGB colorspace. 2019 */ 2020 h=hue*360.0; 2021 if (lightness <= 0.5) 2022 c=2.0*lightness*saturation; 2023 else 2024 c=(2.0-2.0*lightness)*saturation; 2025 tmin=lightness-0.5*c; 2026 h-=360.0*floor(h/360.0); 2027 h/=60.0; 2028 x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0)); 2029 switch ((int) floor(h) % 6) 2030 { 2031 case 0: 2032 { 2033 r=tmin+c; 2034 g=tmin+x; 2035 b=tmin; 2036 break; 2037 } 2038 case 1: 2039 { 2040 r=tmin+x; 2041 g=tmin+c; 2042 b=tmin; 2043 break; 2044 } 2045 case 2: 2046 { 2047 r=tmin; 2048 g=tmin+c; 2049 b=tmin+x; 2050 break; 2051 } 2052 case 3: 2053 { 2054 r=tmin; 2055 g=tmin+x; 2056 b=tmin+c; 2057 break; 2058 } 2059 case 4: 2060 { 2061 r=tmin+x; 2062 g=tmin; 2063 b=tmin+c; 2064 break; 2065 } 2066 case 5: 2067 { 2068 r=tmin+c; 2069 g=tmin; 2070 b=tmin+x; 2071 break; 2072 } 2073 default: 2074 { 2075 r=0.0; 2076 g=0.0; 2077 b=0.0; 2078 } 2079 } 2080 *red=ClampToQuantum(QuantumRange*r); 2081 *green=ClampToQuantum(QuantumRange*g); 2082 *blue=ClampToQuantum(QuantumRange*b); 2083 } 2084 2085 inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness, 2086 CLQuantum *red,CLQuantum *green,CLQuantum *blue) 2087 { 2088 float 2089 hue, 2090 lightness, 2091 saturation; 2092 2093 /* 2094 Increase or decrease color lightness, saturation, or hue. 2095 */ 2096 ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness); 2097 hue+=0.5*(0.01*percent_hue-1.0); 2098 while (hue < 0.0) 2099 hue+=1.0; 2100 while (hue >= 1.0) 2101 hue-=1.0; 2102 saturation*=0.01*percent_saturation; 2103 lightness*=0.01*percent_lightness; 2104 ConvertHSLToRGB(hue,saturation,lightness,red,green,blue); 2105 } 2106 2107 __kernel void Modulate(__global CLPixelType *im, 2108 const float percent_brightness, 2109 const float percent_hue, 2110 const float percent_saturation, 2111 const int colorspace) 2112 { 2113 2114 const int x = get_global_id(0); 2115 const int y = get_global_id(1); 2116 const int columns = get_global_size(0); 2117 const int c = x + y * columns; 2118 2119 CLPixelType pixel = im[c]; 2120 2121 CLQuantum 2122 blue, 2123 green, 2124 red; 2125 2126 red=getRed(pixel); 2127 green=getGreen(pixel); 2128 blue=getBlue(pixel); 2129 2130 switch (colorspace) 2131 { 2132 case HSLColorspace: 2133 default: 2134 { 2135 ModulateHSL(percent_hue, percent_saturation, percent_brightness, 2136 &red, &green, &blue); 2137 } 2138 2139 } 2140 2141 CLPixelType filteredPixel; 2142 2143 setRed(&filteredPixel, red); 2144 setGreen(&filteredPixel, green); 2145 setBlue(&filteredPixel, blue); 2146 filteredPixel.w = pixel.w; 2147 2148 im[c] = filteredPixel; 2149 } 2150 ) 2151 2152 /* 2153 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2154 % % 2155 % % 2156 % % 2157 % M o t i o n B l u r % 2158 % % 2159 % % 2160 % % 2161 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2162 */ 2163 2164 STRINGIFY( 2165 __kernel 2166 void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output, 2167 const unsigned int imageWidth, const unsigned int imageHeight, 2168 const __global float *filter, const unsigned int width, const __global int2* offset, 2169 const float4 bias, 2170 const ChannelType channel, const unsigned int matte) { 2171 2172 int2 currentPixel; 2173 currentPixel.x = get_global_id(0); 2174 currentPixel.y = get_global_id(1); 2175 2176 if (currentPixel.x >= imageWidth 2177 || currentPixel.y >= imageHeight) 2178 return; 2179 2180 float4 pixel; 2181 pixel.x = (float)bias.x; 2182 pixel.y = (float)bias.y; 2183 pixel.z = (float)bias.z; 2184 pixel.w = (float)bias.w; 2185 2186 if (((channel & AlphaChannel) == 0) || (matte == 0)) { 2187 2188 for (int i = 0; i < width; i++) { 2189 // only support EdgeVirtualPixelMethod through ClampToCanvas 2190 // TODO: implement other virtual pixel method 2191 int2 samplePixel = currentPixel + offset[i]; 2192 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2193 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2194 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2195 2196 pixel.x += (filter[i] * (float)samplePixelValue.x); 2197 pixel.y += (filter[i] * (float)samplePixelValue.y); 2198 pixel.z += (filter[i] * (float)samplePixelValue.z); 2199 pixel.w += (filter[i] * (float)samplePixelValue.w); 2200 } 2201 2202 CLPixelType outputPixel; 2203 outputPixel.x = ClampToQuantum(pixel.x); 2204 outputPixel.y = ClampToQuantum(pixel.y); 2205 outputPixel.z = ClampToQuantum(pixel.z); 2206 outputPixel.w = ClampToQuantum(pixel.w); 2207 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2208 } 2209 else { 2210 2211 float gamma = 0.0f; 2212 for (int i = 0; i < width; i++) { 2213 // only support EdgeVirtualPixelMethod through ClampToCanvas 2214 // TODO: implement other virtual pixel method 2215 int2 samplePixel = currentPixel + offset[i]; 2216 samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth); 2217 samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight); 2218 2219 CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x]; 2220 2221 float alpha = QuantumScale*samplePixelValue.w; 2222 float k = filter[i]; 2223 pixel.x = pixel.x + k * alpha * samplePixelValue.x; 2224 pixel.y = pixel.y + k * alpha * samplePixelValue.y; 2225 pixel.z = pixel.z + k * alpha * samplePixelValue.z; 2226 2227 pixel.w += k * alpha * samplePixelValue.w; 2228 2229 gamma+=k*alpha; 2230 } 2231 gamma = PerceptibleReciprocal(gamma); 2232 pixel.xyz = gamma*pixel.xyz; 2233 2234 CLPixelType outputPixel; 2235 outputPixel.x = ClampToQuantum(pixel.x); 2236 outputPixel.y = ClampToQuantum(pixel.y); 2237 outputPixel.z = ClampToQuantum(pixel.z); 2238 outputPixel.w = ClampToQuantum(pixel.w); 2239 output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel; 2240 } 2241 } 2242 ) 2243 2244 /* 2245 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2246 % % 2247 % % 2248 % % 2249 % R e s i z e % 2250 % % 2251 % % 2252 % % 2253 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2254 */ 2255 2256 STRINGIFY( 2257 // Based on Box from resize.c 2258 float BoxResizeFilter(const float x) 2259 { 2260 return 1.0f; 2261 } 2262 ) 2263 2264 STRINGIFY( 2265 // Based on CubicBC from resize.c 2266 float CubicBC(const float x,const __global float* resizeFilterCoefficients) 2267 { 2268 /* 2269 Cubic Filters using B,C determined values: 2270 Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter 2271 Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears 2272 Spline B = 1 C = 0 B-Spline Gaussian approximation 2273 Hermite B = 0 C = 0 B-Spline interpolator 2274 2275 See paper by Mitchell and Netravali, Reconstruction Filters in Computer 2276 Graphics Computer Graphics, Volume 22, Number 4, August 1988 2277 http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/ 2278 Mitchell.pdf. 2279 2280 Coefficents are determined from B,C values: 2281 P0 = ( 6 - 2*B )/6 = coeff[0] 2282 P1 = 0 2283 P2 = (-18 +12*B + 6*C )/6 = coeff[1] 2284 P3 = ( 12 - 9*B - 6*C )/6 = coeff[2] 2285 Q0 = ( 8*B +24*C )/6 = coeff[3] 2286 Q1 = ( -12*B -48*C )/6 = coeff[4] 2287 Q2 = ( 6*B +30*C )/6 = coeff[5] 2288 Q3 = ( - 1*B - 6*C )/6 = coeff[6] 2289 2290 which are used to define the filter: 2291 2292 P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1 2293 Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2 2294 2295 which ensures function is continuous in value and derivative (slope). 2296 */ 2297 if (x < 1.0) 2298 return(resizeFilterCoefficients[0]+x*(x* 2299 (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2]))); 2300 if (x < 2.0) 2301 return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x* 2302 (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6]))); 2303 return(0.0); 2304 } 2305 ) 2306 2307 STRINGIFY( 2308 float Sinc(const float x) 2309 { 2310 if (x != 0.0f) 2311 { 2312 const float alpha=(float) (MagickPI*x); 2313 return sinpi(x)/alpha; 2314 } 2315 return(1.0f); 2316 } 2317 ) 2318 2319 STRINGIFY( 2320 float Triangle(const float x) 2321 { 2322 /* 2323 1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or 2324 a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function 2325 for Sinc(). 2326 */ 2327 return ((x<1.0f)?(1.0f-x):0.0f); 2328 } 2329 ) 2330 2331 2332 STRINGIFY( 2333 float Hann(const float x) 2334 { 2335 /* 2336 Cosine window function: 2337 0.5+0.5*cos(pi*x). 2338 */ 2339 const float cosine=cos((MagickPI*x)); 2340 return(0.5f+0.5f*cosine); 2341 } 2342 ) 2343 2344 STRINGIFY( 2345 float Hamming(const float x) 2346 { 2347 /* 2348 Offset cosine window function: 2349 .54 + .46 cos(pi x). 2350 */ 2351 const float cosine=cos((MagickPI*x)); 2352 return(0.54f+0.46f*cosine); 2353 } 2354 ) 2355 2356 STRINGIFY( 2357 float Blackman(const float x) 2358 { 2359 /* 2360 Blackman: 2nd order cosine windowing function: 2361 0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x) 2362 2363 Refactored by Chantal Racette and Nicolas Robidoux to one trig call and 2364 five flops. 2365 */ 2366 const float cosine=cos((MagickPI*x)); 2367 return(0.34f+cosine*(0.5f+cosine*0.16f)); 2368 } 2369 ) 2370 2371 STRINGIFY( 2372 inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients) 2373 { 2374 switch (filterType) 2375 { 2376 /* Call Sinc even for SincFast to get better precision on GPU 2377 and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/ 2378 case SincWeightingFunction: 2379 case SincFastWeightingFunction: 2380 return Sinc(x); 2381 case CubicBCWeightingFunction: 2382 return CubicBC(x,filterCoefficients); 2383 case BoxWeightingFunction: 2384 return BoxResizeFilter(x); 2385 case TriangleWeightingFunction: 2386 return Triangle(x); 2387 case HannWeightingFunction: 2388 return Hann(x); 2389 case HammingWeightingFunction: 2390 return Hamming(x); 2391 case BlackmanWeightingFunction: 2392 return Blackman(x); 2393 2394 default: 2395 return 0.0f; 2396 } 2397 } 2398 ) 2399 2400 2401 STRINGIFY( 2402 inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType 2403 , const ResizeWeightingFunctionType resizeWindowType 2404 , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x) 2405 { 2406 float scale; 2407 float xBlur = fabs(x/resizeFilterBlur); 2408 if (resizeWindowSupport < MagickEpsilon 2409 || resizeWindowType == BoxWeightingFunction) 2410 { 2411 scale = 1.0f; 2412 } 2413 else 2414 { 2415 scale = resizeFilterScale; 2416 scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients); 2417 } 2418 float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients); 2419 return weight; 2420 } 2421 2422 ) 2423 2424 ; 2425 const char *accelerateKernels2 = 2426 2427 STRINGIFY( 2428 2429 inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2430 return (numWorkItems/pixelPerWorkgroup); 2431 } 2432 2433 // returns the index of the pixel for the current workitem to compute. 2434 // returns -1 if this workitem doesn't need to participate in any computation 2435 inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) { 2436 const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems); 2437 int pixelIndex = itemID/numWorkItemsPerPixel; 2438 pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1; 2439 return pixelIndex; 2440 } 2441 2442 ) 2443 2444 STRINGIFY( 2445 __kernel __attribute__((reqd_work_group_size(256, 1, 1))) 2446 void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2447 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2448 const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor, 2449 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2450 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2451 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2452 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2453 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2454 { 2455 // calculate the range of resized image pixels computed by this workgroup 2456 const unsigned int startX = get_group_id(0)*pixelPerWorkgroup; 2457 const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns); 2458 const unsigned int actualNumPixelToCompute = stopX - startX; 2459 2460 // calculate the range of input image pixels to cache 2461 float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f); 2462 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2463 scale = PerceptibleReciprocal(scale); 2464 2465 const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0)); 2466 const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns); 2467 2468 // cache the input pixels into local memory 2469 const unsigned int y = get_global_id(1); 2470 const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y); 2471 const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels; 2472 event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0); 2473 wait_group_events(1, &e); 2474 2475 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2476 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2477 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2478 { 2479 const unsigned int chunkStartX = startX + chunk*pixelChunkSize; 2480 const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX); 2481 const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX; 2482 2483 // determine which resized pixel computed by this workitem 2484 const unsigned int itemID = get_local_id(0); 2485 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0)); 2486 2487 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0)); 2488 2489 float4 filteredPixel = (float4)0.0f; 2490 float density = 0.0f; 2491 float gamma = 0.0f; 2492 // -1 means this workitem doesn't participate in the computation 2493 if (pixelIndex != -1) 2494 { 2495 // x coordinated of the resized pixel computed by this workitem 2496 const int x = chunkStartX + pixelIndex; 2497 2498 // calculate how many steps required for this pixel 2499 const float bisect = (x+0.5)/xFactor+MagickEpsilon; 2500 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2501 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns); 2502 const unsigned int n = stop - start; 2503 2504 // calculate how many steps this workitem will contribute 2505 unsigned int numStepsPerWorkItem = n / numItems; 2506 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2507 2508 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2509 if (startStep < n) 2510 { 2511 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2512 2513 unsigned int cacheIndex = start+startStep-cacheRangeStartX; 2514 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2515 { 2516 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2517 (ResizeWeightingFunctionType) resizeFilterType, 2518 (ResizeWeightingFunctionType) resizeWindowType, 2519 resizeFilterScale, resizeFilterWindowSupport, 2520 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2521 2522 float4 cp = (float4)0.0f; 2523 2524 __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels); 2525 cp.x = (float) *(p); 2526 if (number_channels > 2) 2527 { 2528 cp.y = (float) *(p + 1); 2529 cp.z = (float) *(p + 2); 2530 } 2531 2532 if (alpha_index != 0) 2533 { 2534 cp.w = (float) *(p + alpha_index); 2535 2536 float alpha = weight * QuantumScale * cp.w; 2537 2538 filteredPixel.x += alpha * cp.x; 2539 filteredPixel.y += alpha * cp.y; 2540 filteredPixel.z += alpha * cp.z; 2541 filteredPixel.w += weight * cp.w; 2542 gamma += alpha; 2543 } 2544 else 2545 filteredPixel += ((float4) weight)*cp; 2546 2547 density += weight; 2548 } 2549 } 2550 } 2551 2552 // initialize the accumulators to zero 2553 if (itemID < actualNumPixelInThisChunk) { 2554 outputPixelCache[itemID] = (float4)0.0f; 2555 densityCache[itemID] = 0.0f; 2556 if (alpha_index != 0) 2557 gammaCache[itemID] = 0.0f; 2558 } 2559 barrier(CLK_LOCAL_MEM_FENCE); 2560 2561 // accumulatte the filtered pixel value and the density 2562 for (unsigned int i = 0; i < numItems; i++) { 2563 if (pixelIndex != -1) { 2564 if (itemID%numItems == i) { 2565 outputPixelCache[pixelIndex]+=filteredPixel; 2566 densityCache[pixelIndex]+=density; 2567 if (alpha_index != 0) 2568 gammaCache[pixelIndex]+=gamma; 2569 } 2570 } 2571 barrier(CLK_LOCAL_MEM_FENCE); 2572 } 2573 2574 if (itemID < actualNumPixelInThisChunk) 2575 { 2576 float4 filteredPixel = outputPixelCache[itemID]; 2577 2578 float gamma = 0.0f; 2579 if (alpha_index != 0) 2580 gamma = gammaCache[itemID]; 2581 2582 float density = densityCache[itemID]; 2583 if ((density != 0.0f) && (density != 1.0f)) 2584 { 2585 density = PerceptibleReciprocal(density); 2586 filteredPixel *= (float4) density; 2587 if (alpha_index != 0) 2588 gamma *= density; 2589 } 2590 2591 if (alpha_index != 0) 2592 { 2593 gamma = PerceptibleReciprocal(gamma); 2594 filteredPixel.x *= gamma; 2595 filteredPixel.y *= gamma; 2596 filteredPixel.z *= gamma; 2597 } 2598 2599 WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel); 2600 } 2601 } 2602 } 2603 ) 2604 2605 2606 STRINGIFY( 2607 __kernel __attribute__((reqd_work_group_size(1, 256, 1))) 2608 void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels, 2609 const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage, 2610 const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor, 2611 const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients, 2612 const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport, 2613 const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels, 2614 const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize, 2615 __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache) 2616 { 2617 // calculate the range of resized image pixels computed by this workgroup 2618 const unsigned int startY = get_group_id(1)*pixelPerWorkgroup; 2619 const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows); 2620 const unsigned int actualNumPixelToCompute = stopY - startY; 2621 2622 // calculate the range of input image pixels to cache 2623 float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f); 2624 const float support = MagickMax(scale*resizeFilterSupport,0.5f); 2625 scale = PerceptibleReciprocal(scale); 2626 2627 const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0)); 2628 const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows); 2629 2630 // cache the input pixels into local memory 2631 const unsigned int x = get_global_id(0); 2632 unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY); 2633 unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY; 2634 unsigned int stride = inputColumns * number_channels; 2635 for (unsigned int i = 0; i < number_channels; i++) 2636 { 2637 event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0); 2638 wait_group_events(1,&e); 2639 } 2640 2641 unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0; 2642 unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize; 2643 for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++) 2644 { 2645 const unsigned int chunkStartY = startY + chunk*pixelChunkSize; 2646 const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY); 2647 const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY; 2648 2649 // determine which resized pixel computed by this workitem 2650 const unsigned int itemID = get_local_id(1); 2651 const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1)); 2652 2653 const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1)); 2654 2655 float4 filteredPixel = (float4)0.0f; 2656 float density = 0.0f; 2657 float gamma = 0.0f; 2658 // -1 means this workitem doesn't participate in the computation 2659 if (pixelIndex != -1) 2660 { 2661 // x coordinated of the resized pixel computed by this workitem 2662 const int y = chunkStartY + pixelIndex; 2663 2664 // calculate how many steps required for this pixel 2665 const float bisect = (y+0.5)/yFactor+MagickEpsilon; 2666 const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f); 2667 const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows); 2668 const unsigned int n = stop - start; 2669 2670 // calculate how many steps this workitem will contribute 2671 unsigned int numStepsPerWorkItem = n / numItems; 2672 numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1); 2673 2674 const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem; 2675 if (startStep < n) 2676 { 2677 const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n); 2678 2679 unsigned int cacheIndex = start+startStep-cacheRangeStartY; 2680 for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++) 2681 { 2682 float weight = getResizeFilterWeight(resizeFilterCubicCoefficients, 2683 (ResizeWeightingFunctionType) resizeFilterType, 2684 (ResizeWeightingFunctionType) resizeWindowType, 2685 resizeFilterScale, resizeFilterWindowSupport, 2686 resizeFilterBlur, scale*(start + i - bisect + 0.5)); 2687 2688 float4 cp = (float4)0.0f; 2689 2690 __local CLQuantum *p = inputImageCache + cacheIndex; 2691 cp.x = (float) *(p); 2692 if (number_channels > 2) 2693 { 2694 cp.y = (float) *(p + rangeLength); 2695 cp.z = (float) *(p + (rangeLength * 2)); 2696 } 2697 2698 if (alpha_index != 0) 2699 { 2700 cp.w = (float) *(p + (rangeLength * alpha_index)); 2701 2702 float alpha = weight * QuantumScale * cp.w; 2703 2704 filteredPixel.x += alpha * cp.x; 2705 filteredPixel.y += alpha * cp.y; 2706 filteredPixel.z += alpha * cp.z; 2707 filteredPixel.w += weight * cp.w; 2708 gamma += alpha; 2709 } 2710 else 2711 filteredPixel += ((float4) weight)*cp; 2712 2713 density += weight; 2714 } 2715 } 2716 } 2717 2718 // initialize the accumulators to zero 2719 if (itemID < actualNumPixelInThisChunk) { 2720 outputPixelCache[itemID] = (float4)0.0f; 2721 densityCache[itemID] = 0.0f; 2722 if (alpha_index != 0) 2723 gammaCache[itemID] = 0.0f; 2724 } 2725 barrier(CLK_LOCAL_MEM_FENCE); 2726 2727 // accumulatte the filtered pixel value and the density 2728 for (unsigned int i = 0; i < numItems; i++) { 2729 if (pixelIndex != -1) { 2730 if (itemID%numItems == i) { 2731 outputPixelCache[pixelIndex]+=filteredPixel; 2732 densityCache[pixelIndex]+=density; 2733 if (alpha_index != 0) 2734 gammaCache[pixelIndex]+=gamma; 2735 } 2736 } 2737 barrier(CLK_LOCAL_MEM_FENCE); 2738 } 2739 2740 if (itemID < actualNumPixelInThisChunk) 2741 { 2742 float4 filteredPixel = outputPixelCache[itemID]; 2743 2744 float gamma = 0.0f; 2745 if (alpha_index != 0) 2746 gamma = gammaCache[itemID]; 2747 2748 float density = densityCache[itemID]; 2749 if ((density != 0.0f) && (density != 1.0f)) 2750 { 2751 density = PerceptibleReciprocal(density); 2752 filteredPixel *= (float4) density; 2753 if (alpha_index != 0) 2754 gamma *= density; 2755 } 2756 2757 if (alpha_index != 0) 2758 { 2759 gamma = PerceptibleReciprocal(gamma); 2760 filteredPixel.x *= gamma; 2761 filteredPixel.y *= gamma; 2762 filteredPixel.z *= gamma; 2763 } 2764 2765 WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel); 2766 } 2767 } 2768 } 2769 ) 2770 2771 /* 2772 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2773 % % 2774 % % 2775 % % 2776 % R o t a t i o n a l B l u r % 2777 % % 2778 % % 2779 % % 2780 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2781 */ 2782 2783 STRINGIFY( 2784 __kernel void RotationalBlur(const __global CLQuantum *image, 2785 const unsigned int number_channels,const unsigned int channel, 2786 const float2 blurCenter,__constant float *cos_theta, 2787 __constant float *sin_theta,const unsigned int cossin_theta_size, 2788 __global CLQuantum *filteredImage) 2789 { 2790 const int x = get_global_id(0); 2791 const int y = get_global_id(1); 2792 const int columns = get_global_size(0); 2793 const int rows = get_global_size(1); 2794 unsigned int step = 1; 2795 float center_x = (float) x - blurCenter.x; 2796 float center_y = (float) y - blurCenter.y; 2797 float radius = hypot(center_x, center_y); 2798 2799 float blur_radius = hypot(blurCenter.x, blurCenter.y); 2800 2801 if (radius > MagickEpsilon) 2802 { 2803 step = (unsigned int) (blur_radius / radius); 2804 if (step == 0) 2805 step = 1; 2806 if (step >= cossin_theta_size) 2807 step = cossin_theta_size-1; 2808 } 2809 2810 float4 result = 0.0f; 2811 float normalize = 0.0f; 2812 float gamma = 0.0f; 2813 2814 for (unsigned int i=0; i<cossin_theta_size; i+=step) 2815 { 2816 int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns); 2817 int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows); 2818 2819 float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy); 2820 2821 if ((number_channels == 4) || (number_channels == 2)) 2822 { 2823 float alpha = (float)(QuantumScale*pixel.w); 2824 2825 gamma += alpha; 2826 2827 result.x += alpha * pixel.x; 2828 result.y += alpha * pixel.y; 2829 result.z += alpha * pixel.z; 2830 result.w += pixel.w; 2831 } 2832 else 2833 result += pixel; 2834 2835 normalize += 1.0f; 2836 } 2837 2838 normalize = PerceptibleReciprocal(normalize); 2839 2840 if ((number_channels == 4) || (number_channels == 2)) 2841 { 2842 gamma = PerceptibleReciprocal(gamma); 2843 result.x *= gamma; 2844 result.y *= gamma; 2845 result.z *= gamma; 2846 result.w *= normalize; 2847 } 2848 else 2849 result *= normalize; 2850 2851 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result); 2852 } 2853 ) 2854 2855 /* 2856 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2857 % % 2858 % % 2859 % % 2860 % U n s h a r p M a s k % 2861 % % 2862 % % 2863 % % 2864 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2865 */ 2866 2867 STRINGIFY( 2868 __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image, 2869 const __global float4 *blurRowData,const unsigned int number_channels, 2870 const ChannelType channel,const unsigned int columns, 2871 const unsigned int rows,__local float4* cachedData, 2872 __local float* cachedFilter,const __global float *filter, 2873 const unsigned int width,const float gain, const float threshold, 2874 __global CLQuantum *filteredImage) 2875 { 2876 const unsigned int radius = (width-1)/2; 2877 2878 // cache the pixel shared by the workgroup 2879 const int groupX = get_group_id(0); 2880 const int groupStartY = get_group_id(1)*get_local_size(1) - radius; 2881 const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius; 2882 2883 if ((groupStartY >= 0) && (groupStopY < rows)) 2884 { 2885 event_t e = async_work_group_strided_copy(cachedData, 2886 blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0); 2887 wait_group_events(1,&e); 2888 } 2889 else 2890 { 2891 for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1)) 2892 cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX]; 2893 2894 barrier(CLK_LOCAL_MEM_FENCE); 2895 } 2896 // cache the filter as well 2897 event_t e = async_work_group_copy(cachedFilter,filter,width,0); 2898 wait_group_events(1,&e); 2899 2900 // only do the work if this is not a patched item 2901 const int cy = get_global_id(1); 2902 2903 if (cy < rows) 2904 { 2905 float4 blurredPixel = (float4) 0.0f; 2906 2907 int i = 0; 2908 2909 for ( ; i+7 < width; ) 2910 { 2911 for (int j=0; j < 8; j++, i++) 2912 blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)]; 2913 } 2914 2915 for ( ; i < width; i++) 2916 blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)]; 2917 2918 float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel); 2919 float4 outputPixel = inputImagePixel - blurredPixel; 2920 2921 float quantumThreshold = QuantumRange*threshold; 2922 2923 int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold); 2924 outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask); 2925 2926 //write back 2927 WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel); 2928 } 2929 } 2930 ) 2931 2932 STRINGIFY( 2933 __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels, 2934 const ChannelType channel,__constant float *filter,const unsigned int width, 2935 const unsigned int columns,const unsigned int rows,__local float4 *pixels, 2936 const float gain,const float threshold,__global CLQuantum *filteredImage) 2937 { 2938 const unsigned int x = get_global_id(0); 2939 const unsigned int y = get_global_id(1); 2940 2941 const unsigned int radius = (width - 1) / 2; 2942 2943 int row = y - radius; 2944 int baseRow = get_group_id(1) * get_local_size(1) - radius; 2945 int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius; 2946 2947 while (row < endRow) { 2948 int srcy = (row < 0) ? -row : row; // mirror pad 2949 srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy; 2950 2951 float4 value = 0.0f; 2952 2953 int ix = x - radius; 2954 int i = 0; 2955 2956 while (i + 7 < width) { 2957 for (int j = 0; j < 8; ++j) { // unrolled 2958 int srcx = ix + j; 2959 srcx = (srcx < 0) ? -srcx : srcx; 2960 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2961 value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2962 } 2963 ix += 8; 2964 i += 8; 2965 } 2966 2967 while (i < width) { 2968 int srcx = (ix < 0) ? -ix : ix; // mirror pad 2969 srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx; 2970 value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel); 2971 ++i; 2972 ++ix; 2973 } 2974 pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value; 2975 row += get_local_size(1); 2976 } 2977 2978 barrier(CLK_LOCAL_MEM_FENCE); 2979 2980 const int px = get_local_id(0); 2981 const int py = get_local_id(1); 2982 const int prp = get_local_size(0); 2983 float4 value = (float4)(0.0f); 2984 2985 int i = 0; 2986 while (i + 7 < width) { 2987 for (int j = 0; j < 8; ++j) // unrolled 2988 value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp]; 2989 i += 8; 2990 } 2991 while (i < width) { 2992 value += (float4)(filter[i]) * pixels[px + (py + i) * prp]; 2993 ++i; 2994 } 2995 2996 float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel); 2997 float4 diff = srcPixel - value; 2998 2999 float quantumThreshold = QuantumRange*threshold; 3000 3001 int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold); 3002 value = select(srcPixel + diff * gain, srcPixel, mask); 3003 3004 if ((x < columns) && (y < rows)) 3005 WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value); 3006 } 3007 ) 3008 3009 STRINGIFY( 3010 __kernel __attribute__((reqd_work_group_size(64, 4, 1))) 3011 void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage, 3012 const unsigned int number_channels,const unsigned int max_channels, 3013 const float threshold,const int passes,const unsigned int imageWidth, 3014 const unsigned int imageHeight) 3015 { 3016 const int pad = (1 << (passes - 1)); 3017 const int tileSize = 64; 3018 const int tileRowPixels = 64; 3019 const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 }; 3020 3021 CLQuantum stage[48]; // 16 * 3 (we only need 3 channels) 3022 3023 local float buffer[64 * 64]; 3024 3025 int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0); 3026 int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad; 3027 3028 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3029 int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) + 3030 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels; 3031 3032 for (int channel = 0; channel < max_channels; ++channel) 3033 stage[(i / 4) + (16 * channel)] = srcImage[pos + channel]; 3034 } 3035 3036 for (int channel = 0; channel < max_channels; ++channel) { 3037 // Load LDS 3038 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3039 buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]); 3040 3041 // Process 3042 3043 float tmp[16]; 3044 float accum[16]; 3045 float pixel; 3046 3047 for (int i = 0; i < 16; i++) 3048 accum[i]=0.0f; 3049 3050 for (int pass = 0; pass < passes; ++pass) { 3051 const int radius = 1 << pass; 3052 const int x = get_local_id(0); 3053 const float thresh = threshold * noise[pass]; 3054 3055 // Apply horizontal hat 3056 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3057 const int offset = i * tileRowPixels; 3058 if (pass == 0) 3059 tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass 3060 pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]); 3061 barrier(CLK_LOCAL_MEM_FENCE); 3062 buffer[x + offset] = pixel; 3063 } 3064 barrier(CLK_LOCAL_MEM_FENCE); 3065 3066 // Apply vertical hat 3067 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3068 pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]); 3069 float delta = tmp[i / 4] - pixel; 3070 tmp[i / 4] = pixel; // hold output in tmp until all workitems are done 3071 if (delta < -thresh) 3072 delta += thresh; 3073 else if (delta > thresh) 3074 delta -= thresh; 3075 else 3076 delta = 0; 3077 accum[i / 4] += delta; 3078 } 3079 barrier(CLK_LOCAL_MEM_FENCE); 3080 3081 if (pass < passes - 1) 3082 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3083 buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass 3084 else // last pass 3085 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3086 accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output 3087 barrier(CLK_LOCAL_MEM_FENCE); 3088 } 3089 3090 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) 3091 stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]); 3092 3093 barrier(CLK_LOCAL_MEM_FENCE); 3094 } 3095 3096 // Write from stage to output 3097 3098 if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) { 3099 for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) { 3100 if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) { 3101 int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels)); 3102 for (int channel = 0; channel < max_channels; ++channel) { 3103 dstImage[pos + channel] = stage[(i / 4) + (16 * channel)]; 3104 } 3105 } 3106 } 3107 } 3108 } 3109 ) 3110 3111 ; 3112 3113 #endif // MAGICKCORE_OPENCL_SUPPORT 3114 3115 #if defined(__cplusplus) || defined(c_plusplus) 3116 } 3117 #endif 3118 3119 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H 3120