1 /*
2   Copyright 1999-2021 ImageMagick Studio LLC, a non-profit organization
3   dedicated to making software imaging solutions freely available.
4 
5   You may not use this file except in compliance with the License.  You may
6   obtain a copy of the License at
7 
8     https://imagemagick.org/script/license.php
9 
10   Unless required by applicable law or agreed to in writing, software
11   distributed under the License is distributed on an "AS IS" BASIS,
12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   See the License for the specific language governing permissions and
14   limitations under the License.
15 
16   MagickCore private kernels for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29   Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE()		"\n #""else " " \n"
34 #define OPENCL_ENDIF()		"\n #""endif " " \n"
35 #define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char *accelerateKernels =
39 
40 /*
41   Define declarations.
42 */
43   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49   OPENCL_DEFINE(SigmaRandom, (attenuate))
50   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53 
54 /*
55   Typedef declarations.
56 */
57   STRINGIFY(
58     typedef enum
59     {
60       UndefinedColorspace,
61       CMYColorspace,           /* negated linear RGB colorspace */
62       CMYKColorspace,          /* CMY with Black separation */
63       GRAYColorspace,          /* Single Channel greyscale (non-linear) image */
64       HCLColorspace,
65       HCLpColorspace,
66       HSBColorspace,
67       HSIColorspace,
68       HSLColorspace,
69       HSVColorspace,           /* alias for HSB */
70       HWBColorspace,
71       LabColorspace,
72       LCHColorspace,           /* alias for LCHuv */
73       LCHabColorspace,         /* Cylindrical (Polar) Lab */
74       LCHuvColorspace,         /* Cylindrical (Polar) Luv */
75       LogColorspace,
76       LMSColorspace,
77       LuvColorspace,
78       OHTAColorspace,
79       Rec601YCbCrColorspace,
80       Rec709YCbCrColorspace,
81       RGBColorspace,           /* Linear RGB colorspace */
82       scRGBColorspace,         /* ??? */
83       sRGBColorspace,          /* Default: non-linear sRGB colorspace */
84       TransparentColorspace,
85       xyYColorspace,
86       XYZColorspace,           /* IEEE Color Reference colorspace */
87       YCbCrColorspace,
88       YCCColorspace,
89       YDbDrColorspace,
90       YIQColorspace,
91       YPbPrColorspace,
92       YUVColorspace,
93       LinearGRAYColorspace     /* Single Channel greyscale (linear) image */
94     } ColorspaceType;
95   )
96 
97   STRINGIFY(
98     typedef enum
99     {
100       UndefinedCompositeOp,
101       AlphaCompositeOp,
102       AtopCompositeOp,
103       BlendCompositeOp,
104       BlurCompositeOp,
105       BumpmapCompositeOp,
106       ChangeMaskCompositeOp,
107       ClearCompositeOp,
108       ColorBurnCompositeOp,
109       ColorDodgeCompositeOp,
110       ColorizeCompositeOp,
111       CopyBlackCompositeOp,
112       CopyBlueCompositeOp,
113       CopyCompositeOp,
114       CopyCyanCompositeOp,
115       CopyGreenCompositeOp,
116       CopyMagentaCompositeOp,
117       CopyAlphaCompositeOp,
118       CopyRedCompositeOp,
119       CopyYellowCompositeOp,
120       DarkenCompositeOp,
121       DarkenIntensityCompositeOp,
122       DifferenceCompositeOp,
123       DisplaceCompositeOp,
124       DissolveCompositeOp,
125       DistortCompositeOp,
126       DivideDstCompositeOp,
127       DivideSrcCompositeOp,
128       DstAtopCompositeOp,
129       DstCompositeOp,
130       DstInCompositeOp,
131       DstOutCompositeOp,
132       DstOverCompositeOp,
133       ExclusionCompositeOp,
134       HardLightCompositeOp,
135       HardMixCompositeOp,
136       HueCompositeOp,
137       InCompositeOp,
138       IntensityCompositeOp,
139       LightenCompositeOp,
140       LightenIntensityCompositeOp,
141       LinearBurnCompositeOp,
142       LinearDodgeCompositeOp,
143       LinearLightCompositeOp,
144       LuminizeCompositeOp,
145       MathematicsCompositeOp,
146       MinusDstCompositeOp,
147       MinusSrcCompositeOp,
148       ModulateCompositeOp,
149       ModulusAddCompositeOp,
150       ModulusSubtractCompositeOp,
151       MultiplyCompositeOp,
152       NoCompositeOp,
153       OutCompositeOp,
154       OverCompositeOp,
155       OverlayCompositeOp,
156       PegtopLightCompositeOp,
157       PinLightCompositeOp,
158       PlusCompositeOp,
159       ReplaceCompositeOp,
160       SaturateCompositeOp,
161       ScreenCompositeOp,
162       SoftLightCompositeOp,
163       SrcAtopCompositeOp,
164       SrcCompositeOp,
165       SrcInCompositeOp,
166       SrcOutCompositeOp,
167       SrcOverCompositeOp,
168       ThresholdCompositeOp,
169       VividLightCompositeOp,
170       XorCompositeOp,
171       StereoCompositeOp
172     } CompositeOperator;
173   )
174 
175   STRINGIFY(
176     typedef enum
177     {
178       UndefinedFunction,
179       ArcsinFunction,
180       ArctanFunction,
181       PolynomialFunction,
182       SinusoidFunction
183     } MagickFunction;
184   )
185 
186   STRINGIFY(
187     typedef enum
188     {
189       UndefinedNoise,
190       UniformNoise,
191       GaussianNoise,
192       MultiplicativeGaussianNoise,
193       ImpulseNoise,
194       LaplacianNoise,
195       PoissonNoise,
196       RandomNoise
197     } NoiseType;
198   )
199 
200   STRINGIFY(
201     typedef enum
202     {
203       UndefinedPixelIntensityMethod = 0,
204       AveragePixelIntensityMethod,
205       BrightnessPixelIntensityMethod,
206       LightnessPixelIntensityMethod,
207       MSPixelIntensityMethod,
208       Rec601LumaPixelIntensityMethod,
209       Rec601LuminancePixelIntensityMethod,
210       Rec709LumaPixelIntensityMethod,
211       Rec709LuminancePixelIntensityMethod,
212       RMSPixelIntensityMethod
213     } PixelIntensityMethod;
214   )
215 
216   STRINGIFY(
217     typedef enum
218     {
219       BoxWeightingFunction = 0,
220       TriangleWeightingFunction,
221       CubicBCWeightingFunction,
222       HannWeightingFunction,
223       HammingWeightingFunction,
224       BlackmanWeightingFunction,
225       GaussianWeightingFunction,
226       QuadraticWeightingFunction,
227       JincWeightingFunction,
228       SincWeightingFunction,
229       SincFastWeightingFunction,
230       KaiserWeightingFunction,
231       WelchWeightingFunction,
232       BohmanWeightingFunction,
233       LagrangeWeightingFunction,
234       CosineWeightingFunction
235     } ResizeWeightingFunctionType;
236   )
237 
238   STRINGIFY(
239     typedef enum
240     {
241       UndefinedChannel = 0x0000,
242       RedChannel = 0x0001,
243       GrayChannel = 0x0001,
244       CyanChannel = 0x0001,
245       GreenChannel = 0x0002,
246       MagentaChannel = 0x0002,
247       BlueChannel = 0x0004,
248       YellowChannel = 0x0004,
249       BlackChannel = 0x0008,
250       AlphaChannel = 0x0010,
251       OpacityChannel = 0x0010,
252       IndexChannel = 0x0020,             /* Color Index Table? */
253       ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
254       WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
255       MetaChannel = 0x0100,              /* ???? */
256       CompositeChannels = 0x001F,
257       AllChannels = 0x7ffffff,
258       /*
259         Special purpose channel types.
260         FUTURE: are these needed any more - they are more like hacks
261         SyncChannels for example is NOT a real channel but a 'flag'
262         It really says -- "User has not defined channels"
263         Though it does have extra meaning in the "-auto-level" operator
264       */
265       TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
266       RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
267       GrayChannels = 0x0400,
268       SyncChannels = 0x20000,    /* channels modified as a single unit */
269       DefaultChannels = AllChannels
270     } ChannelType;  /* must correspond to PixelChannel */
271   )
272 
273 /*
274   Helper functions.
275 */
276 
277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
278 
279   STRINGIFY(
280     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
281     {
282       return((CLQuantum) value);
283     }
284   )
285 
286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
287 
288   STRINGIFY(
289     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
290     {
291       return((CLQuantum) (257.0f*value));
292     }
293   )
294 
295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
296 
297   STRINGIFY(
298     static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
299     {
300       return((CLQuantum) (16843009.0*value));
301     }
302   )
303 
304 OPENCL_ENDIF()
305 
306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
307 
308   STRINGIFY(
309     static inline CLQuantum ClampToQuantum(const float value)
310       {
311         return (CLQuantum) clamp(value, 0.0f, QuantumRange);
312       }
313   )
314 
315 OPENCL_ELSE()
316 
317   STRINGIFY(
318     static inline CLQuantum ClampToQuantum(const float value)
319       {
320         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
321       }
322   )
323 
324 OPENCL_ENDIF()
325 
326   STRINGIFY(
327     static inline int ClampToCanvas(const int offset,const int range)
328       {
329         return clamp(offset, (int)0, range-1);
330       }
331   )
332 
333   STRINGIFY(
334     static inline uint ScaleQuantumToMap(CLQuantum value)
335       {
336         if (value >= (CLQuantum) MaxMap)
337           return ((uint)MaxMap);
338         else
339           return ((uint)value);
340       }
341   )
342 
343   STRINGIFY(
344     static inline float PerceptibleReciprocal(const float x)
345     {
346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348     }
349   )
350 
351   STRINGIFY(
352 
353   static inline unsigned int getPixelIndex(const unsigned int number_channels,
354     const unsigned int columns, const unsigned int x, const unsigned int y)
355   {
356     return (x * number_channels) + (y * columns * number_channels);
357   }
358 
359   static inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
360   static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
361   static inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
362   static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
363 
364   static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
365   static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
366   static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
367   static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
368 
369   static inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
370   static inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
371   static inline float getBlueF4(float4 p)                      { return p.x; }
372   static inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
373 
374   static inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
375   static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
376   static inline float getGreenF4(float4 p)                     { return p.y; }
377   static inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
378 
379   static inline CLQuantum getRed(CLPixelType p)                { return p.z; }
380   static inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
381   static inline float getRedF4(float4 p)                       { return p.z; }
382   static inline void setRedF4(float4* p, float value)          { (*p).z = value; }
383 
384   static inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
385   static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
386   static inline float getAlphaF4(float4 p)                     { return p.w; }
387   static inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
388 
389   static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
390     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
391   {
392     if ((channel & RedChannel) != 0)
393       *red=getPixelRed(p);
394 
395     if (number_channels > 2)
396       {
397         if ((channel & GreenChannel) != 0)
398           *green=getPixelGreen(p);
399 
400         if ((channel & BlueChannel) != 0)
401           *blue=getPixelBlue(p);
402       }
403 
404     if (((number_channels == 4) || (number_channels == 2)) &&
405         ((channel & AlphaChannel) != 0))
406       *alpha=getPixelAlpha(p,number_channels);
407   }
408 
409   static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
410     const unsigned int columns, const unsigned int x, const unsigned int y)
411   {
412     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
413 
414     float4 pixel;
415 
416     pixel.x=getPixelRed(p);
417 
418     if (number_channels > 2)
419       {
420         pixel.y=getPixelGreen(p);
421         pixel.z=getPixelBlue(p);
422       }
423 
424     if ((number_channels == 4) || (number_channels == 2))
425       pixel.w=getPixelAlpha(p,number_channels);
426     return(pixel);
427   }
428 
429   static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
430     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
431   {
432     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
433 
434     float red = 0.0f;
435     float green = 0.0f;
436     float blue = 0.0f;
437     float alpha = 0.0f;
438 
439     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
440     return (float4)(red, green, blue, alpha);
441   }
442 
443   static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
444     const ChannelType channel, float red, float green, float blue, float alpha)
445   {
446     if ((channel & RedChannel) != 0)
447       setPixelRed(p,ClampToQuantum(red));
448 
449     if (number_channels > 2)
450       {
451         if ((channel & GreenChannel) != 0)
452           setPixelGreen(p,ClampToQuantum(green));
453 
454         if ((channel & BlueChannel) != 0)
455           setPixelBlue(p,ClampToQuantum(blue));
456       }
457 
458     if (((number_channels == 4) || (number_channels == 2)) &&
459         ((channel & AlphaChannel) != 0))
460       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
461   }
462 
463   static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
464     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
465   {
466     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
467 
468     setPixelRed(p,ClampToQuantum(pixel.x));
469 
470     if (number_channels > 2)
471       {
472         setPixelGreen(p,ClampToQuantum(pixel.y));
473         setPixelBlue(p,ClampToQuantum(pixel.z));
474       }
475 
476     if ((number_channels == 4) || (number_channels == 2))
477       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
478   }
479 
480   static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
481     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
482     float4 pixel)
483   {
484     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
485     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
486   }
487 
488   static inline float GetPixelIntensity(const unsigned int colorspace,
489     const unsigned int method,float red,float green,float blue)
490   {
491     float intensity;
492 
493     if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
494       return red;
495 
496     switch (method)
497     {
498       case AveragePixelIntensityMethod:
499         {
500           intensity=(red+green+blue)/3.0;
501           break;
502         }
503       case BrightnessPixelIntensityMethod:
504         {
505           intensity=MagickMax(MagickMax(red,green),blue);
506           break;
507         }
508       case LightnessPixelIntensityMethod:
509         {
510           intensity=(MagickMin(MagickMin(red,green),blue)+
511               MagickMax(MagickMax(red,green),blue))/2.0;
512           break;
513         }
514       case MSPixelIntensityMethod:
515         {
516           intensity=(float) (((float) red*red+green*green+blue*blue)/
517               (3.0*QuantumRange));
518           break;
519         }
520       case Rec601LumaPixelIntensityMethod:
521         {
522           /*
523           if (image->colorspace == RGBColorspace)
524           {
525             red=EncodePixelGamma(red);
526             green=EncodePixelGamma(green);
527             blue=EncodePixelGamma(blue);
528           }
529           */
530           intensity=0.298839*red+0.586811*green+0.114350*blue;
531           break;
532         }
533       case Rec601LuminancePixelIntensityMethod:
534         {
535           /*
536           if (image->colorspace == sRGBColorspace)
537           {
538             red=DecodePixelGamma(red);
539             green=DecodePixelGamma(green);
540             blue=DecodePixelGamma(blue);
541           }
542           */
543           intensity=0.298839*red+0.586811*green+0.114350*blue;
544           break;
545         }
546       case Rec709LumaPixelIntensityMethod:
547       default:
548         {
549           /*
550           if (image->colorspace == RGBColorspace)
551           {
552             red=EncodePixelGamma(red);
553             green=EncodePixelGamma(green);
554             blue=EncodePixelGamma(blue);
555           }
556           */
557           intensity=0.212656*red+0.715158*green+0.072186*blue;
558           break;
559         }
560       case Rec709LuminancePixelIntensityMethod:
561         {
562           /*
563           if (image->colorspace == sRGBColorspace)
564           {
565             red=DecodePixelGamma(red);
566             green=DecodePixelGamma(green);
567             blue=DecodePixelGamma(blue);
568           }
569           */
570           intensity=0.212656*red+0.715158*green+0.072186*blue;
571           break;
572         }
573       case RMSPixelIntensityMethod:
574         {
575           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
576               sqrt(3.0));
577           break;
578         }
579     }
580 
581     return intensity;
582   }
583 
584   static inline int mirrorBottom(int value)
585   {
586       return (value < 0) ? - (value) : value;
587   }
588 
589   static inline int mirrorTop(int value, int width)
590   {
591       return (value >= width) ? (2 * width - value - 1) : value;
592   }
593   )
594 
595 /*
596 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
597 %                                                                             %
598 %                                                                             %
599 %                                                                             %
600 %     A d d N o i s e                                                         %
601 %                                                                             %
602 %                                                                             %
603 %                                                                             %
604 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
605 */
606 
607   STRINGIFY(
608   /*
609   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
610   This is provided under BSD, full license is with the main package.
611   See http://www.doc.ic.ac.uk/~dt10/research
612   */
613 
614   // Pre: a<M, b<M
615   // Post: r=(a+b) mod M
616   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
617   {
618     ulong v=a+b;
619     //if( (v>=M) || (v<a) )
620     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
621       v=v-M;
622     return v;
623   }
624 
625   // Pre: a<M,b<M
626   // Post: r=(a*b) mod M
627   // This could be done more efficently, but it is portable, and should
628   // be easy to understand. It can be replaced with any of the better
629   // modular multiplication algorithms (for example if you know you have
630   // double precision available or something).
631   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
632   {
633     ulong r=0;
634     while(a!=0){
635       if(a&1)
636         r=MWC_AddMod64(r,b,M);
637       b=MWC_AddMod64(b,b,M);
638       a=a>>1;
639     }
640     return r;
641   }
642 
643   // Pre: a<M, e>=0
644   // Post: r=(a^b) mod M
645   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
646   // most architectures
647   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
648   {
649     ulong sqr=a, acc=1;
650     while(e!=0){
651       if(e&1)
652         acc=MWC_MulMod64(acc,sqr,M);
653         sqr=MWC_MulMod64(sqr,sqr,M);
654       e=e>>1;
655     }
656     return acc;
657   }
658 
659   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
660   {
661     ulong m=MWC_PowMod64(A, distance, M);
662     ulong x=curr.x*(ulong)A+curr.y;
663     x=MWC_MulMod64(x, m, M);
664     return (uint2)((uint)(x/A), (uint)(x%A));
665   }
666 
667   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
668   {
669     // This is an arbitrary constant for starting LCG jumping from. I didn't
670     // want to start from 1, as then you end up with the two or three first values
671     // being a bit poor in ones - once you've decided that, one constant is as
672     // good as any another. There is no deep mathematical reason for it, I just
673     // generated a random number.
674     enum{ MWC_BASEID = 4077358422479273989UL };
675 
676     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
677     ulong m=MWC_PowMod64(A, dist, M);
678 
679     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
680     return (uint2)((uint)(x/A), (uint)(x%A));
681   }
682 
683   //! Represents the state of a particular generator
684   typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
685 
686   void MWC64X_Step(mwc64x_state_t *s)
687   {
688     uint X=s->x, C=s->c;
689 
690     uint Xn=s->seed0*X+C;
691     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
692     uint Cn=mad_hi(s->seed0,X,carry);
693 
694     s->x=Xn;
695     s->c=Cn;
696   }
697 
698   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
699   {
700     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
701     s->x=tmp.x;
702     s->c=tmp.y;
703   }
704 
705   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
706   {
707     uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
708     s->x=tmp.x;
709     s->c=tmp.y;
710   }
711 
712   //! Return a 32-bit integer in the range [0..2^32)
713   uint MWC64X_NextUint(mwc64x_state_t *s)
714   {
715     uint res=s->x ^ s->c;
716     MWC64X_Step(s);
717     return res;
718   }
719 
720   //
721   // End of MWC64X excerpt
722   //
723 
724   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
725   {
726     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
727   }
728 
729   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
730   {
731     float
732       alpha,
733       beta,
734       noise,
735       sigma;
736 
737     noise = 0.0f;
738     alpha=mwcReadPseudoRandomValue(r);
739     switch(noise_type)
740     {
741       case UniformNoise:
742       default:
743         {
744           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
745           break;
746         }
747       case GaussianNoise:
748         {
749           float
750             gamma,
751             tau;
752 
753           if (alpha == 0.0f)
754             alpha=1.0f;
755           beta=mwcReadPseudoRandomValue(r);
756           gamma=sqrt(-2.0f*log(alpha));
757           sigma=gamma*cospi((2.0f*beta));
758           tau=gamma*sinpi((2.0f*beta));
759           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
760           break;
761         }
762       case ImpulseNoise:
763       {
764         if (alpha < (SigmaImpulse/2.0f))
765           noise=0.0f;
766         else
767           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
768             noise=QuantumRange;
769           else
770             noise=pixel;
771         break;
772       }
773       case LaplacianNoise:
774       {
775         if (alpha <= 0.5f)
776           {
777             if (alpha <= MagickEpsilon)
778               noise=(pixel-QuantumRange);
779             else
780               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
781                 0.5f);
782             break;
783           }
784         beta=1.0f-alpha;
785         if (beta <= (0.5f*MagickEpsilon))
786           noise=(pixel+QuantumRange);
787         else
788           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
789         break;
790       }
791       case MultiplicativeGaussianNoise:
792       {
793         sigma=1.0f;
794         if (alpha > MagickEpsilon)
795           sigma=sqrt(-2.0f*log(alpha));
796         beta=mwcReadPseudoRandomValue(r);
797         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
798           cospi((2.0f*beta))/2.0f);
799         break;
800       }
801       case PoissonNoise:
802       {
803         float
804           poisson;
805         unsigned int i;
806         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
807         for (i=0; alpha > poisson; i++)
808         {
809           beta=mwcReadPseudoRandomValue(r);
810           alpha*=beta;
811         }
812         noise=(QuantumRange*i*PerceptibleReciprocal(SigmaPoisson));
813         break;
814       }
815       case RandomNoise:
816       {
817         noise=(QuantumRange*SigmaRandom*alpha);
818         break;
819       }
820     }
821     return noise;
822   }
823 
824   __kernel
825   void AddNoise(const __global CLQuantum *image,
826     const unsigned int number_channels,const ChannelType channel,
827     const unsigned int length,const unsigned int pixelsPerWorkItem,
828     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
829     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
830     __global CLQuantum *filteredImage)
831   {
832     mwc64x_state_t rng;
833     rng.seed0 = seed0;
834     rng.seed1 = seed1;
835 
836     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
837     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
838     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
839 
840     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
841     uint count = pixelsPerWorkItem;
842 
843     while (count > 0 && pos < length)
844     {
845       const __global CLQuantum *p = image + pos;
846       __global CLQuantum *q = filteredImage + pos;
847 
848       float red;
849       float green;
850       float blue;
851       float alpha;
852 
853       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
854 
855       if ((channel & RedChannel) != 0)
856         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
857 
858       if (number_channels > 2)
859       {
860         if ((channel & GreenChannel) != 0)
861           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
862 
863         if ((channel & BlueChannel) != 0)
864           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
865       }
866 
867       if (((number_channels == 4) || (number_channels == 2)) &&
868           ((channel & AlphaChannel) != 0))
869         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
870 
871       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
872 
873       pos += (get_local_size(0) * number_channels);
874       count--;
875     }
876   }
877   )
878 
879 /*
880 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
881 %                                                                             %
882 %                                                                             %
883 %                                                                             %
884 %    B l u r                                                                  %
885 %                                                                             %
886 %                                                                             %
887 %                                                                             %
888 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
889 */
890 
891   STRINGIFY(
892   /*
893   Reduce image noise and reduce detail levels by row
894   */
895   __kernel void BlurRow(const __global CLQuantum *image,
896     const unsigned int number_channels,const ChannelType channel,
897     __constant float *filter,const unsigned int width,
898     const unsigned int imageColumns,const unsigned int imageRows,
899     __local float4 *temp,__global float4 *tempImage)
900   {
901     const int x = get_global_id(0);
902     const int y = get_global_id(1);
903 
904     const int columns = imageColumns;
905 
906     const unsigned int radius = (width-1)/2;
907     const int wsize = get_local_size(0);
908     const unsigned int loadSize = wsize+width;
909 
910     //group coordinate
911     const int groupX=get_local_size(0)*get_group_id(0);
912 
913     //parallel load and clamp
914     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
915     {
916       int cx = ClampToCanvas(i + groupX - radius, columns);
917       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
918     }
919 
920     // barrier
921     barrier(CLK_LOCAL_MEM_FENCE);
922 
923     // only do the work if this is not a patched item
924     if (get_global_id(0) < columns)
925     {
926       // compute
927       float4 result = (float4) 0;
928 
929       int i = 0;
930 
931       for ( ; i+7 < width; )
932       {
933         for (int j=0; j < 8; j++)
934           result+=filter[i+j]*temp[i+j+get_local_id(0)];
935         i+=8;
936       }
937 
938       for ( ; i < width; i++)
939         result+=filter[i]*temp[i+get_local_id(0)];
940 
941       // write back to global
942       tempImage[y*columns+x] = result;
943     }
944   }
945   )
946 
947   STRINGIFY(
948   /*
949   Reduce image noise and reduce detail levels by line
950   */
951   __kernel void BlurColumn(const __global float4 *blurRowData,
952     const unsigned int number_channels,const ChannelType channel,
953     __constant float *filter,const unsigned int width,
954     const unsigned int imageColumns,const unsigned int imageRows,
955     __local float4 *temp,__global CLQuantum *filteredImage)
956   {
957     const int x = get_global_id(0);
958     const int y = get_global_id(1);
959 
960     const int columns = imageColumns;
961     const int rows = imageRows;
962 
963     unsigned int radius = (width-1)/2;
964     const int wsize = get_local_size(1);
965     const unsigned int loadSize = wsize+width;
966 
967     //group coordinate
968     const int groupX=get_local_size(0)*get_group_id(0);
969     const int groupY=get_local_size(1)*get_group_id(1);
970     //notice that get_local_size(0) is 1, so
971     //groupX=get_group_id(0);
972 
973     //parallel load and clamp
974     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
975       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
976 
977     // barrier
978     barrier(CLK_LOCAL_MEM_FENCE);
979 
980     // only do the work if this is not a patched item
981     if (get_global_id(1) < rows)
982     {
983       // compute
984       float4 result = (float4) 0;
985 
986       int i = 0;
987 
988       for ( ; i+7 < width; )
989       {
990         for (int j=0; j < 8; j++)
991           result+=filter[i+j]*temp[i+j+get_local_id(1)];
992         i+=8;
993       }
994 
995       for ( ; i < width; i++)
996         result+=filter[i]*temp[i+get_local_id(1)];
997 
998       // write back to global
999       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1000     }
1001   }
1002   )
1003 
1004 /*
1005 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1006 %                                                                             %
1007 %                                                                             %
1008 %                                                                             %
1009 %    C o n t r a s t                                                          %
1010 %                                                                             %
1011 %                                                                             %
1012 %                                                                             %
1013 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1014 */
1015 
1016   STRINGIFY(
1017 
1018   static inline float4 ConvertRGBToHSB(const float4 pixel)
1019   {
1020     float4 result=0.0f;
1021     result.w=pixel.w;
1022     float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1023     if (tmax != 0.0f)
1024     {
1025       float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1026       float delta=tmax-tmin;
1027 
1028       result.y=delta/tmax;
1029       result.z=QuantumScale*tmax;
1030       if (delta != 0.0f)
1031       {
1032         result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1033         result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1034           (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1035         result.x/=6.0f;
1036         result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1037       }
1038     }
1039     return(result);
1040   }
1041 
1042   static inline float4 ConvertHSBToRGB(const float4 pixel)
1043   {
1044     float hue=pixel.x;
1045     float saturation=pixel.y;
1046     float brightness=pixel.z;
1047 
1048     float4 result=pixel;
1049 
1050     if (saturation == 0.0f)
1051     {
1052       result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1053     }
1054     else
1055     {
1056       float h=6.0f*(hue-floor(hue));
1057       float f=h-floor(h);
1058       float p=brightness*(1.0f-saturation);
1059       float q=brightness*(1.0f-saturation*f);
1060       float t=brightness*(1.0f-(saturation*(1.0f-f)));
1061       int ih = (int)h;
1062 
1063       if (ih == 1)
1064       {
1065         result.x=ClampToQuantum(QuantumRange*q);
1066         result.y=ClampToQuantum(QuantumRange*brightness);
1067         result.z=ClampToQuantum(QuantumRange*p);
1068       }
1069       else if (ih == 2)
1070       {
1071         result.x=ClampToQuantum(QuantumRange*p);
1072         result.y=ClampToQuantum(QuantumRange*brightness);
1073         result.z=ClampToQuantum(QuantumRange*t);
1074       }
1075       else if (ih == 3)
1076       {
1077         result.x=ClampToQuantum(QuantumRange*p);
1078         result.y=ClampToQuantum(QuantumRange*q);
1079         result.z=ClampToQuantum(QuantumRange*brightness);
1080       }
1081       else if (ih == 4)
1082       {
1083         result.x=ClampToQuantum(QuantumRange*t);
1084         result.y=ClampToQuantum(QuantumRange*p);
1085         result.z=ClampToQuantum(QuantumRange*brightness);
1086       }
1087       else if (ih == 5)
1088       {
1089         result.x=ClampToQuantum(QuantumRange*brightness);
1090         result.y=ClampToQuantum(QuantumRange*p);
1091         result.z=ClampToQuantum(QuantumRange*q);
1092       }
1093       else
1094       {
1095         result.x=ClampToQuantum(QuantumRange*brightness);
1096         result.y=ClampToQuantum(QuantumRange*t);
1097         result.z=ClampToQuantum(QuantumRange*p);
1098       }
1099     }
1100     return(result);
1101   }
1102 
1103   __kernel void Contrast(__global CLQuantum *image,
1104     const unsigned int number_channels,const int sign)
1105   {
1106     const int x=get_global_id(0);
1107     const int y=get_global_id(1);
1108     const unsigned int columns=get_global_size(0);
1109 
1110     float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1111     if (number_channels < 3)
1112       pixel.y=pixel.z=pixel.x;
1113 
1114     pixel=ConvertRGBToHSB(pixel);
1115     float brightness=pixel.z;
1116     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1117     brightness=clamp(brightness,0.0f,1.0f);
1118     pixel.z=brightness;
1119     pixel=ConvertHSBToRGB(pixel);
1120 
1121     WriteAllChannels(image,number_channels,columns,x,y,pixel);
1122   }
1123   )
1124 
1125 /*
1126 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1127 %                                                                             %
1128 %                                                                             %
1129 %                                                                             %
1130 %    C o n t r a s t S t r e t c h                                            %
1131 %                                                                             %
1132 %                                                                             %
1133 %                                                                             %
1134 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1135 */
1136 
1137     STRINGIFY(
1138     /*
1139     */
1140     __kernel void Histogram(__global CLPixelType * restrict im,
1141       const ChannelType channel,
1142       const unsigned int colorspace,
1143       const unsigned int method,
1144       __global uint4 * restrict histogram)
1145       {
1146         const int x = get_global_id(0);
1147         const int y = get_global_id(1);
1148         const int columns = get_global_size(0);
1149         const int c = x + y * columns;
1150         if ((channel & SyncChannels) != 0)
1151         {
1152           float red=(float)getRed(im[c]);
1153           float green=(float)getGreen(im[c]);
1154           float blue=(float)getBlue(im[c]);
1155 
1156           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1157           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1158           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1159         }
1160         else
1161         {
1162           // for equalizing, we always need all channels?
1163           // otherwise something more
1164         }
1165       }
1166     )
1167 
1168     STRINGIFY(
1169     /*
1170     */
1171     __kernel void ContrastStretch(__global CLPixelType * restrict im,
1172       const ChannelType channel,
1173       __global CLPixelType * restrict stretch_map,
1174       const float4 white, const float4 black)
1175       {
1176         const int x = get_global_id(0);
1177         const int y = get_global_id(1);
1178         const int columns = get_global_size(0);
1179         const int c = x + y * columns;
1180 
1181         uint ePos;
1182         CLPixelType oValue, eValue;
1183         CLQuantum red, green, blue, alpha;
1184 
1185         //read from global
1186         oValue=im[c];
1187 
1188         if ((channel & RedChannel) != 0)
1189         {
1190           if (getRedF4(white) != getRedF4(black))
1191           {
1192             ePos = ScaleQuantumToMap(getRed(oValue));
1193             eValue = stretch_map[ePos];
1194             red = getRed(eValue);
1195           }
1196         }
1197 
1198         if ((channel & GreenChannel) != 0)
1199         {
1200           if (getGreenF4(white) != getGreenF4(black))
1201           {
1202             ePos = ScaleQuantumToMap(getGreen(oValue));
1203             eValue = stretch_map[ePos];
1204             green = getGreen(eValue);
1205           }
1206         }
1207 
1208         if ((channel & BlueChannel) != 0)
1209         {
1210           if (getBlueF4(white) != getBlueF4(black))
1211           {
1212             ePos = ScaleQuantumToMap(getBlue(oValue));
1213             eValue = stretch_map[ePos];
1214             blue = getBlue(eValue);
1215           }
1216         }
1217 
1218         if ((channel & AlphaChannel) != 0)
1219         {
1220           if (getAlphaF4(white) != getAlphaF4(black))
1221           {
1222             ePos = ScaleQuantumToMap(getAlpha(oValue));
1223             eValue = stretch_map[ePos];
1224             alpha = getAlpha(eValue);
1225           }
1226         }
1227 
1228         //write back
1229         im[c]=(CLPixelType)(blue, green, red, alpha);
1230 
1231       }
1232     )
1233 
1234 /*
1235 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1236 %                                                                             %
1237 %                                                                             %
1238 %                                                                             %
1239 %    C o n v o l v e                                                          %
1240 %                                                                             %
1241 %                                                                             %
1242 %                                                                             %
1243 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1244 */
1245 
1246   STRINGIFY(
1247     __kernel
1248     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1249     const unsigned int imageWidth, const unsigned int imageHeight,
1250     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1251     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1252 
1253       int2 blockID;
1254       blockID.x = get_global_id(0) / get_local_size(0);
1255       blockID.y = get_global_id(1) / get_local_size(1);
1256 
1257       // image area processed by this workgroup
1258       int2 imageAreaOrg;
1259       imageAreaOrg.x = blockID.x * get_local_size(0);
1260       imageAreaOrg.y = blockID.y * get_local_size(1);
1261 
1262       int2 midFilterDimen;
1263       midFilterDimen.x = (filterWidth-1)/2;
1264       midFilterDimen.y = (filterHeight-1)/2;
1265 
1266       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1267 
1268       // dimension of the local cache
1269       int2 cachedAreaDimen;
1270       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1271       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1272 
1273       // cache the pixels accessed by this workgroup in local memory
1274       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1275       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1276       int groupSize = get_local_size(0) * get_local_size(1);
1277       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1278 
1279         int2 cachedAreaIndex;
1280         cachedAreaIndex.x = i % cachedAreaDimen.x;
1281         cachedAreaIndex.y = i / cachedAreaDimen.x;
1282 
1283         int2 imagePixelIndex;
1284         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1285 
1286         // only support EdgeVirtualPixelMethod through ClampToCanvas
1287         // TODO: implement other virtual pixel method
1288         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1289         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1290 
1291         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1292       }
1293 
1294       // cache the filter
1295       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1296         filterCache[i] = filter[i];
1297       }
1298       barrier(CLK_LOCAL_MEM_FENCE);
1299 
1300 
1301       int2 imageIndex;
1302       imageIndex.x = imageAreaOrg.x + get_local_id(0);
1303       imageIndex.y = imageAreaOrg.y + get_local_id(1);
1304 
1305       // if out-of-range, stops here and quit
1306       if (imageIndex.x >= imageWidth
1307         || imageIndex.y >= imageHeight) {
1308           return;
1309       }
1310 
1311       int filterIndex = 0;
1312       float4 sum = (float4)0.0f;
1313       float gamma = 0.0f;
1314       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1315         int cacheIndexY = get_local_id(1);
1316         for (int j = 0; j < filterHeight; j++) {
1317           int cacheIndexX = get_local_id(0);
1318           for (int i = 0; i < filterWidth; i++) {
1319             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1320             float f = filterCache[filterIndex];
1321 
1322             sum.x += f * p.x;
1323             sum.y += f * p.y;
1324             sum.z += f * p.z;
1325             sum.w += f * p.w;
1326 
1327             gamma += f;
1328             filterIndex++;
1329             cacheIndexX++;
1330           }
1331           cacheIndexY++;
1332         }
1333       }
1334       else {
1335         int cacheIndexY = get_local_id(1);
1336         for (int j = 0; j < filterHeight; j++) {
1337           int cacheIndexX = get_local_id(0);
1338           for (int i = 0; i < filterWidth; i++) {
1339 
1340             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1341             float alpha = QuantumScale*p.w;
1342             float f = filterCache[filterIndex];
1343             float g = alpha * f;
1344 
1345             sum.x += g*p.x;
1346             sum.y += g*p.y;
1347             sum.z += g*p.z;
1348             sum.w += f*p.w;
1349 
1350             gamma += g;
1351             filterIndex++;
1352             cacheIndexX++;
1353           }
1354           cacheIndexY++;
1355         }
1356         gamma = PerceptibleReciprocal(gamma);
1357         sum.xyz = gamma*sum.xyz;
1358       }
1359       CLPixelType outputPixel;
1360       outputPixel.x = ClampToQuantum(sum.x);
1361       outputPixel.y = ClampToQuantum(sum.y);
1362       outputPixel.z = ClampToQuantum(sum.z);
1363       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1364 
1365       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1366     }
1367   )
1368 
1369   STRINGIFY(
1370     __kernel
1371     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1372                   const uint imageWidth, const uint imageHeight,
1373                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1374                   const uint matte, const ChannelType channel) {
1375 
1376       int2 imageIndex;
1377       imageIndex.x = get_global_id(0);
1378       imageIndex.y = get_global_id(1);
1379 
1380       /*
1381       unsigned int imageWidth = get_global_size(0);
1382       unsigned int imageHeight = get_global_size(1);
1383       */
1384       if (imageIndex.x >= imageWidth
1385           || imageIndex.y >= imageHeight)
1386           return;
1387 
1388       int2 midFilterDimen;
1389       midFilterDimen.x = (filterWidth-1)/2;
1390       midFilterDimen.y = (filterHeight-1)/2;
1391 
1392       int filterIndex = 0;
1393       float4 sum = (float4)0.0f;
1394       float gamma = 0.0f;
1395       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1396         for (int j = 0; j < filterHeight; j++) {
1397           int2 inputPixelIndex;
1398           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1399           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1400           for (int i = 0; i < filterWidth; i++) {
1401             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1402             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1403 
1404             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1405             float f = filter[filterIndex];
1406 
1407             sum.x += f * p.x;
1408             sum.y += f * p.y;
1409             sum.z += f * p.z;
1410             sum.w += f * p.w;
1411 
1412             gamma += f;
1413 
1414             filterIndex++;
1415           }
1416         }
1417       }
1418       else {
1419 
1420         for (int j = 0; j < filterHeight; j++) {
1421           int2 inputPixelIndex;
1422           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1423           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1424           for (int i = 0; i < filterWidth; i++) {
1425             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1426             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1427 
1428             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1429             float alpha = QuantumScale*p.w;
1430             float f = filter[filterIndex];
1431             float g = alpha * f;
1432 
1433             sum.x += g*p.x;
1434             sum.y += g*p.y;
1435             sum.z += g*p.z;
1436             sum.w += f*p.w;
1437 
1438             gamma += g;
1439 
1440 
1441             filterIndex++;
1442           }
1443         }
1444         gamma = PerceptibleReciprocal(gamma);
1445         sum.xyz = gamma*sum.xyz;
1446       }
1447 
1448       CLPixelType outputPixel;
1449       outputPixel.x = ClampToQuantum(sum.x);
1450       outputPixel.y = ClampToQuantum(sum.y);
1451       outputPixel.z = ClampToQuantum(sum.z);
1452       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1453 
1454       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1455     }
1456   )
1457 
1458 /*
1459 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1460 %                                                                             %
1461 %                                                                             %
1462 %                                                                             %
1463 %     D e s p e c k l e                                                       %
1464 %                                                                             %
1465 %                                                                             %
1466 %                                                                             %
1467 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1468 */
1469 
1470   STRINGIFY(
1471 
1472   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1473   , const unsigned int imageWidth, const unsigned int imageHeight
1474   , const int2 offset, const int polarity, const int matte) {
1475 
1476     int x = get_global_id(0);
1477     int y = get_global_id(1);
1478 
1479     CLPixelType v = inputImage[y*imageWidth+x];
1480 
1481     int2 neighbor;
1482     neighbor.y = y + offset.y;
1483     neighbor.x = x + offset.x;
1484 
1485     int2 clampedNeighbor;
1486     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1487     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1488 
1489     CLPixelType r = (clampedNeighbor.x == neighbor.x
1490                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1491     :(CLPixelType)0;
1492 
1493     int sv[4];
1494     sv[0] = (int)v.x;
1495     sv[1] = (int)v.y;
1496     sv[2] = (int)v.z;
1497     sv[3] = (int)v.w;
1498 
1499     int sr[4];
1500     sr[0] = (int)r.x;
1501     sr[1] = (int)r.y;
1502     sr[2] = (int)r.z;
1503     sr[3] = (int)r.w;
1504 
1505     if (polarity > 0) {
1506       \n #pragma unroll 4\n
1507       for (unsigned int i = 0; i < 4; i++) {
1508         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1509       }
1510     }
1511     else {
1512       \n #pragma unroll 4\n
1513       for (unsigned int i = 0; i < 4; i++) {
1514         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1515       }
1516 
1517     }
1518 
1519     v.x = (CLQuantum)sv[0];
1520     v.y = (CLQuantum)sv[1];
1521     v.z = (CLQuantum)sv[2];
1522 
1523     if (matte!=0)
1524       v.w = (CLQuantum)sv[3];
1525 
1526     outputImage[y*imageWidth+x] = v;
1527 
1528     }
1529 
1530 
1531   )
1532 
1533   STRINGIFY(
1534 
1535   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1536   , const unsigned int imageWidth, const unsigned int imageHeight
1537   , const int2 offset, const int polarity, const int matte) {
1538 
1539     int x = get_global_id(0);
1540     int y = get_global_id(1);
1541 
1542     CLPixelType v = inputImage[y*imageWidth+x];
1543 
1544     int2 neighbor, clampedNeighbor;
1545 
1546     neighbor.y = y + offset.y;
1547     neighbor.x = x + offset.x;
1548     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1549     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1550 
1551     CLPixelType r = (clampedNeighbor.x == neighbor.x
1552       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1553     :(CLPixelType)0;
1554 
1555 
1556     neighbor.y = y - offset.y;
1557     neighbor.x = x - offset.x;
1558     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1559     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1560 
1561     CLPixelType s = (clampedNeighbor.x == neighbor.x
1562       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1563     :(CLPixelType)0;
1564 
1565 
1566     int sv[4];
1567     sv[0] = (int)v.x;
1568     sv[1] = (int)v.y;
1569     sv[2] = (int)v.z;
1570     sv[3] = (int)v.w;
1571 
1572     int sr[4];
1573     sr[0] = (int)r.x;
1574     sr[1] = (int)r.y;
1575     sr[2] = (int)r.z;
1576     sr[3] = (int)r.w;
1577 
1578     int ss[4];
1579     ss[0] = (int)s.x;
1580     ss[1] = (int)s.y;
1581     ss[2] = (int)s.z;
1582     ss[3] = (int)s.w;
1583 
1584     if (polarity > 0) {
1585       \n #pragma unroll 4\n
1586       for (unsigned int i = 0; i < 4; i++) {
1587         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1588         //
1589         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1590         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1591         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1592       }
1593     }
1594     else {
1595       \n #pragma unroll 4\n
1596       for (unsigned int i = 0; i < 4; i++) {
1597         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1598         //
1599         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1600         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1601       }
1602     }
1603 
1604     v.x = (CLQuantum)sv[0];
1605     v.y = (CLQuantum)sv[1];
1606     v.z = (CLQuantum)sv[2];
1607 
1608     if (matte!=0)
1609       v.w = (CLQuantum)sv[3];
1610 
1611     outputImage[y*imageWidth+x] = v;
1612 
1613     }
1614   )
1615 
1616 /*
1617 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1618 %                                                                             %
1619 %                                                                             %
1620 %                                                                             %
1621 %     E q u a l i z e                                                         %
1622 %                                                                             %
1623 %                                                                             %
1624 %                                                                             %
1625 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1626 */
1627 
1628     STRINGIFY(
1629     /*
1630     */
1631     __kernel void Equalize(__global CLPixelType * restrict im,
1632       const ChannelType channel,
1633       __global CLPixelType * restrict equalize_map,
1634       const float4 white, const float4 black)
1635       {
1636         const int x = get_global_id(0);
1637         const int y = get_global_id(1);
1638         const int columns = get_global_size(0);
1639         const int c = x + y * columns;
1640 
1641         uint ePos;
1642         CLPixelType oValue, eValue;
1643         CLQuantum red, green, blue, alpha;
1644 
1645         //read from global
1646         oValue=im[c];
1647 
1648         if ((channel & SyncChannels) != 0)
1649         {
1650           if (getRedF4(white) != getRedF4(black))
1651           {
1652             ePos = ScaleQuantumToMap(getRed(oValue));
1653             eValue = equalize_map[ePos];
1654             red = getRed(eValue);
1655             ePos = ScaleQuantumToMap(getGreen(oValue));
1656             eValue = equalize_map[ePos];
1657             green = getRed(eValue);
1658             ePos = ScaleQuantumToMap(getBlue(oValue));
1659             eValue = equalize_map[ePos];
1660             blue = getRed(eValue);
1661             ePos = ScaleQuantumToMap(getAlpha(oValue));
1662             eValue = equalize_map[ePos];
1663             alpha = getRed(eValue);
1664 
1665             //write back
1666             im[c]=(CLPixelType)(blue, green, red, alpha);
1667           }
1668 
1669         }
1670 
1671         // for equalizing, we always need all channels?
1672         // otherwise something more
1673 
1674      }
1675     )
1676 
1677 /*
1678 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1679 %                                                                             %
1680 %                                                                             %
1681 %                                                                             %
1682 %     F u n c t i o n                                                         %
1683 %                                                                             %
1684 %                                                                             %
1685 %                                                                             %
1686 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1687 */
1688 
1689   STRINGIFY(
1690   /*
1691   apply FunctionImageChannel(braightness-contrast)
1692   */
1693   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1694     const unsigned int number_parameters,__constant float *parameters)
1695   {
1696     float result = 0.0f;
1697 
1698     switch (function)
1699     {
1700     case PolynomialFunction:
1701       {
1702         for (unsigned int i=0; i < number_parameters; i++)
1703           result = result*QuantumScale*pixel + parameters[i];
1704         result *= QuantumRange;
1705         break;
1706       }
1707     case SinusoidFunction:
1708       {
1709         float  freq,phase,ampl,bias;
1710         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1711         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1712         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1713         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1714         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1715           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1716         break;
1717       }
1718     case ArcsinFunction:
1719       {
1720         float  width,range,center,bias;
1721         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1722         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1723         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1724         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1725 
1726         result = 2.0f/width*(QuantumScale*pixel - center);
1727         result = range/MagickPI*asin(result)+bias;
1728         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1729         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1730         result *= QuantumRange;
1731         break;
1732       }
1733     case ArctanFunction:
1734       {
1735         float slope,range,center,bias;
1736         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1737         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1738         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1739         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1740         result = MagickPI*slope*(QuantumScale*pixel-center);
1741         result = QuantumRange*(range/MagickPI*atan(result) + bias);
1742         break;
1743       }
1744     case UndefinedFunction:
1745       break;
1746     }
1747     return(ClampToQuantum(result));
1748   }
1749   )
1750 
1751   STRINGIFY(
1752   /*
1753   Improve brightness / contrast of the image
1754   channel : define which channel is improved
1755   function : the function called to enchance the brightness contrast
1756   number_parameters : numbers of parameters
1757   parameters : the parameter
1758   */
1759   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1760     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1761     __constant float *parameters)
1762   {
1763     const unsigned int x = get_global_id(0);
1764     const unsigned int y = get_global_id(1);
1765     const unsigned int columns = get_global_size(0);
1766     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1767 
1768     float red;
1769     float green;
1770     float blue;
1771     float alpha;
1772 
1773     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1774 
1775     if ((channel & RedChannel) != 0)
1776       red=ApplyFunction(red, function, number_parameters, parameters);
1777 
1778     if (number_channels > 2)
1779       {
1780         if ((channel & GreenChannel) != 0)
1781           green=ApplyFunction(green, function, number_parameters, parameters);
1782 
1783         if ((channel & BlueChannel) != 0)
1784           blue=ApplyFunction(blue, function, number_parameters, parameters);
1785       }
1786 
1787     if (((number_channels == 4) || (number_channels == 2)) &&
1788         ((channel & AlphaChannel) != 0))
1789       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1790 
1791     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1792   }
1793   )
1794 
1795 /*
1796 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1797 %                                                                             %
1798 %                                                                             %
1799 %                                                                             %
1800 %     G r a y s c a l e                                                       %
1801 %                                                                             %
1802 %                                                                             %
1803 %                                                                             %
1804 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1805 */
1806 
1807   STRINGIFY(
1808   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1809     const unsigned int colorspace,const unsigned int method)
1810   {
1811     const unsigned int x = get_global_id(0);
1812     const unsigned int y = get_global_id(1);
1813     const unsigned int columns = get_global_size(0);
1814     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1815 
1816     float
1817       blue,
1818       green,
1819       red;
1820 
1821     red=getPixelRed(p);
1822     green=getPixelGreen(p);
1823     blue=getPixelBlue(p);
1824 
1825     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1826 
1827     setPixelRed(p,intensity);
1828     setPixelGreen(p,intensity);
1829     setPixelBlue(p,intensity);
1830   }
1831   )
1832 
1833 /*
1834 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1835 %                                                                             %
1836 %                                                                             %
1837 %                                                                             %
1838 %     L o c a l C o n t r a s t                                               %
1839 %                                                                             %
1840 %                                                                             %
1841 %                                                                             %
1842 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1843 */
1844 
1845     STRINGIFY(
1846 
1847       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1848           const int radius,
1849           const int imageWidth,
1850           const int imageHeight)
1851       {
1852         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1853 
1854         int x = get_local_id(0);
1855         int y = get_global_id(1);
1856 
1857         if ((x >= imageWidth) || (y >= imageHeight))
1858           return;
1859 
1860         global CLPixelType *src = srcImage + y * imageWidth;
1861 
1862         for (int i = x; i < imageWidth; i += get_local_size(0)) {
1863             float sum = 0.0f;
1864             float weight = 1.0f;
1865 
1866             int j = i - radius;
1867             while ((j + 7) < i) {
1868                 for (int k = 0; k < 8; ++k) // Unroll 8x
1869                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1870                 weight += 8.0f;
1871                 j+=8;
1872             }
1873             while (j < i) {
1874                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1875                 weight += 1.0f;
1876                 ++j;
1877             }
1878 
1879             while ((j + 7) < radius + i) {
1880                 for (int k = 0; k < 8; ++k) // Unroll 8x
1881                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1882                 weight -= 8.0f;
1883                 j+=8;
1884             }
1885             while (j < radius + i) {
1886                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1887                 weight -= 1.0f;
1888                 ++j;
1889             }
1890 
1891             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1892         }
1893       }
1894     )
1895 
1896     STRINGIFY(
1897       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1898           const int radius,
1899           const float strength,
1900           const int imageWidth,
1901           const int imageHeight)
1902       {
1903         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1904 
1905         int x = get_global_id(0);
1906         int y = get_global_id(1);
1907 
1908         if ((x >= imageWidth) || (y >= imageHeight))
1909                 return;
1910 
1911         global float *src = blurImage + x;
1912 
1913         float sum = 0.0f;
1914         float weight = 1.0f;
1915 
1916         int j = y - radius;
1917         while ((j + 7) < y) {
1918             for (int k = 0; k < 8; ++k) // Unroll 8x
1919                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1920             weight += 8.0f;
1921             j+=8;
1922         }
1923         while (j < y) {
1924             sum += weight * src[mirrorBottom(j) * imageWidth];
1925             weight += 1.0f;
1926             ++j;
1927         }
1928 
1929         while ((j + 7) < radius + y) {
1930             for (int k = 0; k < 8; ++k) // Unroll 8x
1931                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1932             weight -= 8.0f;
1933             j+=8;
1934         }
1935         while (j < radius + y) {
1936             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1937             weight -= 1.0f;
1938             ++j;
1939         }
1940 
1941         CLPixelType pixel = srcImage[x + y * imageWidth];
1942         float srcVal = dot(RGB, convert_float4(pixel));
1943         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1944         mult = (srcVal + mult) / srcVal;
1945 
1946         pixel.x = ClampToQuantum(pixel.x * mult);
1947         pixel.y = ClampToQuantum(pixel.y * mult);
1948         pixel.z = ClampToQuantum(pixel.z * mult);
1949 
1950         dstImage[x + y * imageWidth] = pixel;
1951       }
1952     )
1953 
1954 /*
1955 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1956 %                                                                             %
1957 %                                                                             %
1958 %                                                                             %
1959 %     M o d u l a t e                                                         %
1960 %                                                                             %
1961 %                                                                             %
1962 %                                                                             %
1963 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1964 */
1965 
1966   STRINGIFY(
1967 
1968   static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1969     float *hue, float *saturation, float *lightness)
1970   {
1971   float
1972     c,
1973     tmax,
1974     tmin;
1975 
1976   /*
1977      Convert RGB to HSL colorspace.
1978      */
1979   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1980   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1981 
1982   c=tmax-tmin;
1983 
1984   *lightness=(tmax+tmin)/2.0;
1985   if (c <= 0.0)
1986   {
1987     *hue=0.0;
1988     *saturation=0.0;
1989     return;
1990   }
1991 
1992   if (tmax == (QuantumScale*red))
1993   {
1994     *hue=(QuantumScale*green-QuantumScale*blue)/c;
1995     if ((QuantumScale*green) < (QuantumScale*blue))
1996       *hue+=6.0;
1997   }
1998   else
1999     if (tmax == (QuantumScale*green))
2000       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2001     else
2002       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2003 
2004   *hue*=60.0/360.0;
2005   if (*lightness <= 0.5)
2006     *saturation=c/(2.0*(*lightness));
2007   else
2008     *saturation=c/(2.0-2.0*(*lightness));
2009   }
2010 
2011   static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2012       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2013   {
2014     float
2015       b,
2016       c,
2017       g,
2018       h,
2019       tmin,
2020       r,
2021       x;
2022 
2023     /*
2024        Convert HSL to RGB colorspace.
2025        */
2026     h=hue*360.0;
2027     if (lightness <= 0.5)
2028       c=2.0*lightness*saturation;
2029     else
2030       c=(2.0-2.0*lightness)*saturation;
2031     tmin=lightness-0.5*c;
2032     h-=360.0*floor(h/360.0);
2033     h/=60.0;
2034     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2035     switch ((int) floor(h) % 6)
2036     {
2037       case 0:
2038         {
2039           r=tmin+c;
2040           g=tmin+x;
2041           b=tmin;
2042           break;
2043         }
2044       case 1:
2045         {
2046           r=tmin+x;
2047           g=tmin+c;
2048           b=tmin;
2049           break;
2050         }
2051       case 2:
2052         {
2053           r=tmin;
2054           g=tmin+c;
2055           b=tmin+x;
2056           break;
2057         }
2058       case 3:
2059         {
2060           r=tmin;
2061           g=tmin+x;
2062           b=tmin+c;
2063           break;
2064         }
2065       case 4:
2066         {
2067           r=tmin+x;
2068           g=tmin;
2069           b=tmin+c;
2070           break;
2071         }
2072       case 5:
2073         {
2074           r=tmin+c;
2075           g=tmin;
2076           b=tmin+x;
2077           break;
2078         }
2079       default:
2080         {
2081           r=0.0;
2082           g=0.0;
2083           b=0.0;
2084         }
2085     }
2086     *red=ClampToQuantum(QuantumRange*r);
2087     *green=ClampToQuantum(QuantumRange*g);
2088     *blue=ClampToQuantum(QuantumRange*b);
2089   }
2090 
2091   static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2092     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2093   {
2094     float
2095       hue,
2096       lightness,
2097       saturation;
2098 
2099     /*
2100     Increase or decrease color lightness, saturation, or hue.
2101     */
2102     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2103     hue+=0.5*(0.01*percent_hue-1.0);
2104     while (hue < 0.0)
2105       hue+=1.0;
2106     while (hue >= 1.0)
2107       hue-=1.0;
2108     saturation*=0.01*percent_saturation;
2109     lightness*=0.01*percent_lightness;
2110     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2111   }
2112 
2113   __kernel void Modulate(__global CLPixelType *im,
2114     const float percent_brightness,
2115     const float percent_hue,
2116     const float percent_saturation,
2117     const int colorspace)
2118   {
2119 
2120     const int x = get_global_id(0);
2121     const int y = get_global_id(1);
2122     const int columns = get_global_size(0);
2123     const int c = x + y * columns;
2124 
2125     CLPixelType pixel = im[c];
2126 
2127     CLQuantum
2128         blue,
2129         green,
2130         red;
2131 
2132     red=getRed(pixel);
2133     green=getGreen(pixel);
2134     blue=getBlue(pixel);
2135 
2136     switch (colorspace)
2137     {
2138       case HSLColorspace:
2139       default:
2140         {
2141           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2142               &red, &green, &blue);
2143         }
2144 
2145     }
2146 
2147     CLPixelType filteredPixel;
2148 
2149     setRed(&filteredPixel, red);
2150     setGreen(&filteredPixel, green);
2151     setBlue(&filteredPixel, blue);
2152     filteredPixel.w = pixel.w;
2153 
2154     im[c] = filteredPixel;
2155   }
2156   )
2157 
2158 /*
2159 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2160 %                                                                             %
2161 %                                                                             %
2162 %                                                                             %
2163 %     M o t i o n B l u r                                                     %
2164 %                                                                             %
2165 %                                                                             %
2166 %                                                                             %
2167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2168 */
2169 
2170   STRINGIFY(
2171     __kernel
2172     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2173                     const unsigned int imageWidth, const unsigned int imageHeight,
2174                     const __global float *filter, const unsigned int width, const __global int2* offset,
2175                     const float4 bias,
2176                     const ChannelType channel, const unsigned int matte) {
2177 
2178       int2 currentPixel;
2179       currentPixel.x = get_global_id(0);
2180       currentPixel.y = get_global_id(1);
2181 
2182       if (currentPixel.x >= imageWidth
2183           || currentPixel.y >= imageHeight)
2184           return;
2185 
2186       float4 pixel;
2187       pixel.x = (float)bias.x;
2188       pixel.y = (float)bias.y;
2189       pixel.z = (float)bias.z;
2190       pixel.w = (float)bias.w;
2191 
2192       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2193 
2194         for (int i = 0; i < width; i++) {
2195           // only support EdgeVirtualPixelMethod through ClampToCanvas
2196           // TODO: implement other virtual pixel method
2197           int2 samplePixel = currentPixel + offset[i];
2198           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2199           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2200           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2201 
2202           pixel.x += (filter[i] * (float)samplePixelValue.x);
2203           pixel.y += (filter[i] * (float)samplePixelValue.y);
2204           pixel.z += (filter[i] * (float)samplePixelValue.z);
2205           pixel.w += (filter[i] * (float)samplePixelValue.w);
2206         }
2207 
2208         CLPixelType outputPixel;
2209         outputPixel.x = ClampToQuantum(pixel.x);
2210         outputPixel.y = ClampToQuantum(pixel.y);
2211         outputPixel.z = ClampToQuantum(pixel.z);
2212         outputPixel.w = ClampToQuantum(pixel.w);
2213         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2214       }
2215       else {
2216 
2217         float gamma = 0.0f;
2218         for (int i = 0; i < width; i++) {
2219           // only support EdgeVirtualPixelMethod through ClampToCanvas
2220           // TODO: implement other virtual pixel method
2221           int2 samplePixel = currentPixel + offset[i];
2222           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2223           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2224 
2225           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2226 
2227           float alpha = QuantumScale*samplePixelValue.w;
2228           float k = filter[i];
2229           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2230           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2231           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2232 
2233           pixel.w += k * alpha * samplePixelValue.w;
2234 
2235           gamma+=k*alpha;
2236         }
2237         gamma = PerceptibleReciprocal(gamma);
2238         pixel.xyz = gamma*pixel.xyz;
2239 
2240         CLPixelType outputPixel;
2241         outputPixel.x = ClampToQuantum(pixel.x);
2242         outputPixel.y = ClampToQuantum(pixel.y);
2243         outputPixel.z = ClampToQuantum(pixel.z);
2244         outputPixel.w = ClampToQuantum(pixel.w);
2245         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2246       }
2247     }
2248   )
2249 
2250 /*
2251 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2252 %                                                                             %
2253 %                                                                             %
2254 %                                                                             %
2255 %     R e s i z e                                                             %
2256 %                                                                             %
2257 %                                                                             %
2258 %                                                                             %
2259 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2260 */
2261 
2262   STRINGIFY(
2263   // Based on Box from resize.c
2264   float BoxResizeFilter(const float x)
2265   {
2266     return 1.0f;
2267   }
2268   )
2269 
2270   STRINGIFY(
2271   // Based on CubicBC from resize.c
2272   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2273   {
2274     /*
2275     Cubic Filters using B,C determined values:
2276     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2277     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2278     Spline              B = 1   C = 0    B-Spline Gaussian approximation
2279     Hermite             B = 0   C = 0    B-Spline interpolator
2280 
2281     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2282     Graphics Computer Graphics, Volume 22, Number 4, August 1988
2283     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2284     Mitchell.pdf.
2285 
2286     Coefficents are determined from B,C values:
2287     P0 = (  6 - 2*B       )/6 = coeff[0]
2288     P1 =         0
2289     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2290     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2291     Q0 = (      8*B +24*C )/6 = coeff[3]
2292     Q1 = (    -12*B -48*C )/6 = coeff[4]
2293     Q2 = (      6*B +30*C )/6 = coeff[5]
2294     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2295 
2296     which are used to define the filter:
2297 
2298     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2299     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2300 
2301     which ensures function is continuous in value and derivative (slope).
2302     */
2303     if (x < 1.0)
2304       return(resizeFilterCoefficients[0]+x*(x*
2305       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2306     if (x < 2.0)
2307       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2308       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2309     return(0.0);
2310   }
2311   )
2312 
2313   STRINGIFY(
2314   float Sinc(const float x)
2315   {
2316     if (x != 0.0f)
2317     {
2318       const float alpha=(float) (MagickPI*x);
2319       return sinpi(x)/alpha;
2320     }
2321     return(1.0f);
2322   }
2323   )
2324 
2325   STRINGIFY(
2326   float Triangle(const float x)
2327   {
2328     /*
2329     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2330     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2331     for Sinc().
2332     */
2333     return ((x<1.0f)?(1.0f-x):0.0f);
2334   }
2335   )
2336 
2337 
2338   STRINGIFY(
2339   float Hann(const float x)
2340   {
2341     /*
2342     Cosine window function:
2343       0.5+0.5*cos(pi*x).
2344     */
2345     const float cosine=cos((MagickPI*x));
2346     return(0.5f+0.5f*cosine);
2347   }
2348   )
2349 
2350   STRINGIFY(
2351   float Hamming(const float x)
2352   {
2353     /*
2354       Offset cosine window function:
2355        .54 + .46 cos(pi x).
2356     */
2357     const float cosine=cos((MagickPI*x));
2358     return(0.54f+0.46f*cosine);
2359   }
2360   )
2361 
2362   STRINGIFY(
2363   float Blackman(const float x)
2364   {
2365     /*
2366       Blackman: 2nd order cosine windowing function:
2367         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2368 
2369       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2370       five flops.
2371     */
2372     const float cosine=cos((MagickPI*x));
2373     return(0.34f+cosine*(0.5f+cosine*0.16f));
2374   }
2375   )
2376 
2377   STRINGIFY(
2378   static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2379   {
2380     switch (filterType)
2381     {
2382     /* Call Sinc even for SincFast to get better precision on GPU
2383        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2384     case SincWeightingFunction:
2385     case SincFastWeightingFunction:
2386       return Sinc(x);
2387     case CubicBCWeightingFunction:
2388       return CubicBC(x,filterCoefficients);
2389     case BoxWeightingFunction:
2390       return BoxResizeFilter(x);
2391     case TriangleWeightingFunction:
2392       return Triangle(x);
2393     case HannWeightingFunction:
2394       return Hann(x);
2395     case HammingWeightingFunction:
2396       return Hamming(x);
2397     case BlackmanWeightingFunction:
2398       return Blackman(x);
2399 
2400     default:
2401       return 0.0f;
2402     }
2403   }
2404   )
2405 
2406 
2407   STRINGIFY(
2408   static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2409            , const ResizeWeightingFunctionType resizeWindowType
2410            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2411   {
2412     float scale;
2413     float xBlur = fabs(x/resizeFilterBlur);
2414     if (resizeWindowSupport < MagickEpsilon
2415         || resizeWindowType == BoxWeightingFunction)
2416     {
2417       scale = 1.0f;
2418     }
2419     else
2420     {
2421       scale = resizeFilterScale;
2422       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2423     }
2424     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2425     return weight;
2426   }
2427 
2428   )
2429 
2430   ;
2431   const char *accelerateKernels2 =
2432 
2433   STRINGIFY(
2434 
2435   static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2436     return (numWorkItems/pixelPerWorkgroup);
2437   }
2438 
2439   // returns the index of the pixel for the current workitem to compute.
2440   // returns -1 if this workitem doesn't need to participate in any computation
2441   static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2442     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2443     int pixelIndex = itemID/numWorkItemsPerPixel;
2444     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2445     return pixelIndex;
2446   }
2447 
2448   )
2449 
2450   STRINGIFY(
2451   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2452     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2453       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2454       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2455       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2456       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2457       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2458       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2459       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2460   {
2461     // calculate the range of resized image pixels computed by this workgroup
2462     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2463     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2464     const unsigned int actualNumPixelToCompute = stopX - startX;
2465 
2466     // calculate the range of input image pixels to cache
2467     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2468     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2469     scale = PerceptibleReciprocal(scale);
2470 
2471     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2472     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2473 
2474     // cache the input pixels into local memory
2475     const unsigned int y = get_global_id(1);
2476     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2477     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2478     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2479     wait_group_events(1, &e);
2480 
2481     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2482     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2483     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2484     {
2485       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2486       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2487       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2488 
2489       // determine which resized pixel computed by this workitem
2490       const unsigned int itemID = get_local_id(0);
2491       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2492 
2493       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2494 
2495       float4 filteredPixel = (float4)0.0f;
2496       float density = 0.0f;
2497       float gamma = 0.0f;
2498       // -1 means this workitem doesn't participate in the computation
2499       if (pixelIndex != -1)
2500       {
2501         // x coordinated of the resized pixel computed by this workitem
2502         const int x = chunkStartX + pixelIndex;
2503 
2504         // calculate how many steps required for this pixel
2505         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2506         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2507         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2508         const unsigned int n = stop - start;
2509 
2510         // calculate how many steps this workitem will contribute
2511         unsigned int numStepsPerWorkItem = n / numItems;
2512         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2513 
2514         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2515         if (startStep < n)
2516         {
2517           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2518 
2519           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2520           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2521           {
2522             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2523               (ResizeWeightingFunctionType) resizeFilterType,
2524               (ResizeWeightingFunctionType) resizeWindowType,
2525               resizeFilterScale, resizeFilterWindowSupport,
2526               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2527 
2528             float4 cp = (float4)0.0f;
2529 
2530             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2531             cp.x = (float) *(p);
2532             if (number_channels > 2)
2533             {
2534               cp.y = (float) *(p + 1);
2535               cp.z = (float) *(p + 2);
2536             }
2537 
2538             if (alpha_index != 0)
2539             {
2540               cp.w = (float) *(p + alpha_index);
2541 
2542               float alpha = weight * QuantumScale * cp.w;
2543 
2544               filteredPixel.x += alpha * cp.x;
2545               filteredPixel.y += alpha * cp.y;
2546               filteredPixel.z += alpha * cp.z;
2547               filteredPixel.w += weight * cp.w;
2548               gamma += alpha;
2549             }
2550             else
2551               filteredPixel += ((float4) weight)*cp;
2552 
2553             density += weight;
2554           }
2555         }
2556       }
2557 
2558       // initialize the accumulators to zero
2559       if (itemID < actualNumPixelInThisChunk) {
2560         outputPixelCache[itemID] = (float4)0.0f;
2561         densityCache[itemID] = 0.0f;
2562         if (alpha_index != 0)
2563           gammaCache[itemID] = 0.0f;
2564       }
2565       barrier(CLK_LOCAL_MEM_FENCE);
2566 
2567       // accumulatte the filtered pixel value and the density
2568       for (unsigned int i = 0; i < numItems; i++) {
2569         if (pixelIndex != -1) {
2570           if (itemID%numItems == i) {
2571             outputPixelCache[pixelIndex]+=filteredPixel;
2572             densityCache[pixelIndex]+=density;
2573             if (alpha_index != 0)
2574               gammaCache[pixelIndex]+=gamma;
2575           }
2576         }
2577         barrier(CLK_LOCAL_MEM_FENCE);
2578       }
2579 
2580       if (itemID < actualNumPixelInThisChunk)
2581       {
2582         float4 filteredPixel = outputPixelCache[itemID];
2583 
2584         float gamma = 0.0f;
2585         if (alpha_index != 0)
2586           gamma = gammaCache[itemID];
2587 
2588         float density = densityCache[itemID];
2589         if ((density != 0.0f) && (density != 1.0f))
2590         {
2591           density = PerceptibleReciprocal(density);
2592           filteredPixel *= (float4) density;
2593           if (alpha_index != 0)
2594             gamma *= density;
2595         }
2596 
2597         if (alpha_index != 0)
2598         {
2599           gamma = PerceptibleReciprocal(gamma);
2600           filteredPixel.x *= gamma;
2601           filteredPixel.y *= gamma;
2602           filteredPixel.z *= gamma;
2603         }
2604 
2605         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2606       }
2607     }
2608   }
2609   )
2610 
2611 
2612   STRINGIFY(
2613  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2614     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2615       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2616       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2617       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2618       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2619       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2620       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2621       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2622   {
2623     // calculate the range of resized image pixels computed by this workgroup
2624     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2625     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2626     const unsigned int actualNumPixelToCompute = stopY - startY;
2627 
2628     // calculate the range of input image pixels to cache
2629     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2630     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2631     scale = PerceptibleReciprocal(scale);
2632 
2633     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2634     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2635 
2636     // cache the input pixels into local memory
2637     const unsigned int x = get_global_id(0);
2638     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2639     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2640     unsigned int stride = inputColumns * number_channels;
2641     for (unsigned int i = 0; i < number_channels; i++)
2642     {
2643       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2644       wait_group_events(1,&e);
2645     }
2646 
2647     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2648     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2649     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2650     {
2651       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2652       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2653       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2654 
2655       // determine which resized pixel computed by this workitem
2656       const unsigned int itemID = get_local_id(1);
2657       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2658 
2659       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2660 
2661       float4 filteredPixel = (float4)0.0f;
2662       float density = 0.0f;
2663       float gamma = 0.0f;
2664       // -1 means this workitem doesn't participate in the computation
2665       if (pixelIndex != -1)
2666       {
2667         // x coordinated of the resized pixel computed by this workitem
2668         const int y = chunkStartY + pixelIndex;
2669 
2670         // calculate how many steps required for this pixel
2671         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2672         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2673         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2674         const unsigned int n = stop - start;
2675 
2676         // calculate how many steps this workitem will contribute
2677         unsigned int numStepsPerWorkItem = n / numItems;
2678         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2679 
2680         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2681         if (startStep < n)
2682         {
2683           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2684 
2685           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2686           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2687           {
2688             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2689               (ResizeWeightingFunctionType) resizeFilterType,
2690               (ResizeWeightingFunctionType) resizeWindowType,
2691               resizeFilterScale, resizeFilterWindowSupport,
2692               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2693 
2694             float4 cp = (float4)0.0f;
2695 
2696             __local CLQuantum *p = inputImageCache + cacheIndex;
2697             cp.x = (float) *(p);
2698             if (number_channels > 2)
2699             {
2700               cp.y = (float) *(p + rangeLength);
2701               cp.z = (float) *(p + (rangeLength * 2));
2702             }
2703 
2704             if (alpha_index != 0)
2705             {
2706               cp.w = (float) *(p + (rangeLength * alpha_index));
2707 
2708               float alpha = weight * QuantumScale * cp.w;
2709 
2710               filteredPixel.x += alpha * cp.x;
2711               filteredPixel.y += alpha * cp.y;
2712               filteredPixel.z += alpha * cp.z;
2713               filteredPixel.w += weight * cp.w;
2714               gamma += alpha;
2715             }
2716             else
2717               filteredPixel += ((float4) weight)*cp;
2718 
2719             density += weight;
2720           }
2721         }
2722       }
2723 
2724       // initialize the accumulators to zero
2725       if (itemID < actualNumPixelInThisChunk) {
2726         outputPixelCache[itemID] = (float4)0.0f;
2727         densityCache[itemID] = 0.0f;
2728         if (alpha_index != 0)
2729           gammaCache[itemID] = 0.0f;
2730       }
2731       barrier(CLK_LOCAL_MEM_FENCE);
2732 
2733       // accumulatte the filtered pixel value and the density
2734       for (unsigned int i = 0; i < numItems; i++) {
2735         if (pixelIndex != -1) {
2736           if (itemID%numItems == i) {
2737             outputPixelCache[pixelIndex]+=filteredPixel;
2738             densityCache[pixelIndex]+=density;
2739             if (alpha_index != 0)
2740               gammaCache[pixelIndex]+=gamma;
2741           }
2742         }
2743         barrier(CLK_LOCAL_MEM_FENCE);
2744       }
2745 
2746       if (itemID < actualNumPixelInThisChunk)
2747       {
2748         float4 filteredPixel = outputPixelCache[itemID];
2749 
2750         float gamma = 0.0f;
2751         if (alpha_index != 0)
2752           gamma = gammaCache[itemID];
2753 
2754         float density = densityCache[itemID];
2755         if ((density != 0.0f) && (density != 1.0f))
2756         {
2757           density = PerceptibleReciprocal(density);
2758           filteredPixel *= (float4) density;
2759           if (alpha_index != 0)
2760             gamma *= density;
2761         }
2762 
2763         if (alpha_index != 0)
2764         {
2765           gamma = PerceptibleReciprocal(gamma);
2766           filteredPixel.x *= gamma;
2767           filteredPixel.y *= gamma;
2768           filteredPixel.z *= gamma;
2769         }
2770 
2771         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2772       }
2773     }
2774   }
2775   )
2776 
2777 /*
2778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2779 %                                                                             %
2780 %                                                                             %
2781 %                                                                             %
2782 %     R o t a t i o n a l B l u r                                             %
2783 %                                                                             %
2784 %                                                                             %
2785 %                                                                             %
2786 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2787 */
2788 
2789   STRINGIFY(
2790   __kernel void RotationalBlur(const __global CLQuantum *image,
2791     const unsigned int number_channels,const unsigned int channel,
2792     const float2 blurCenter,__constant float *cos_theta,
2793     __constant float *sin_theta,const unsigned int cossin_theta_size,
2794     __global CLQuantum *filteredImage)
2795   {
2796     const int x = get_global_id(0);
2797     const int y = get_global_id(1);
2798     const int columns = get_global_size(0);
2799     const int rows = get_global_size(1);
2800     unsigned int step = 1;
2801     float center_x = (float) x - blurCenter.x;
2802     float center_y = (float) y - blurCenter.y;
2803     float radius = hypot(center_x, center_y);
2804 
2805     float blur_radius = hypot(blurCenter.x, blurCenter.y);
2806 
2807     if (radius > MagickEpsilon)
2808     {
2809       step = (unsigned int) (blur_radius / radius);
2810       if (step == 0)
2811         step = 1;
2812       if (step >= cossin_theta_size)
2813         step = cossin_theta_size-1;
2814     }
2815 
2816     float4 result = 0.0f;
2817     float normalize = 0.0f;
2818     float gamma = 0.0f;
2819 
2820     for (unsigned int i=0; i<cossin_theta_size; i+=step)
2821     {
2822       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2823       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2824 
2825       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2826 
2827       if ((number_channels == 4) || (number_channels == 2))
2828       {
2829         float alpha = (float)(QuantumScale*pixel.w);
2830 
2831         gamma += alpha;
2832 
2833         result.x += alpha * pixel.x;
2834         result.y += alpha * pixel.y;
2835         result.z += alpha * pixel.z;
2836         result.w += pixel.w;
2837       }
2838       else
2839         result += pixel;
2840 
2841       normalize += 1.0f;
2842     }
2843 
2844     normalize = PerceptibleReciprocal(normalize);
2845 
2846     if ((number_channels == 4) || (number_channels == 2))
2847     {
2848       gamma = PerceptibleReciprocal(gamma);
2849       result.x *= gamma;
2850       result.y *= gamma;
2851       result.z *= gamma;
2852       result.w *= normalize;
2853     }
2854     else
2855       result *= normalize;
2856 
2857     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2858   }
2859   )
2860 
2861 /*
2862 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2863 %                                                                             %
2864 %                                                                             %
2865 %                                                                             %
2866 %     U n s h a r p M a s k                                                   %
2867 %                                                                             %
2868 %                                                                             %
2869 %                                                                             %
2870 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2871 */
2872 
2873   STRINGIFY(
2874   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2875     const __global float4 *blurRowData,const unsigned int number_channels,
2876     const ChannelType channel,const unsigned int columns,
2877     const unsigned int rows,__local float4* cachedData,
2878     __local float* cachedFilter,const __global float *filter,
2879     const unsigned int width,const float gain, const float threshold,
2880     __global CLQuantum *filteredImage)
2881   {
2882     const unsigned int radius = (width-1)/2;
2883 
2884     // cache the pixel shared by the workgroup
2885     const int groupX = get_group_id(0);
2886     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2887     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2888 
2889     if ((groupStartY >= 0) && (groupStopY < rows))
2890     {
2891       event_t e = async_work_group_strided_copy(cachedData,
2892         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2893       wait_group_events(1,&e);
2894     }
2895     else
2896     {
2897       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2898         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2899 
2900       barrier(CLK_LOCAL_MEM_FENCE);
2901     }
2902     // cache the filter as well
2903     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2904     wait_group_events(1,&e);
2905 
2906     // only do the work if this is not a patched item
2907     const int cy = get_global_id(1);
2908 
2909     if (cy < rows)
2910     {
2911       float4 blurredPixel = (float4) 0.0f;
2912 
2913       int i = 0;
2914 
2915       for ( ; i+7 < width; )
2916       {
2917         for (int j=0; j < 8; j++, i++)
2918           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2919       }
2920 
2921       for ( ; i < width; i++)
2922         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2923 
2924       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2925       float4 outputPixel = inputImagePixel - blurredPixel;
2926 
2927       float quantumThreshold = QuantumRange*threshold;
2928 
2929       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2930       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2931 
2932       //write back
2933       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2934     }
2935   }
2936   )
2937 
2938   STRINGIFY(
2939   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2940     const ChannelType channel,__constant float *filter,const unsigned int width,
2941     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2942     const float gain,const float threshold,__global CLQuantum *filteredImage)
2943   {
2944     const unsigned int x = get_global_id(0);
2945     const unsigned int y = get_global_id(1);
2946 
2947     const unsigned int radius = (width - 1) / 2;
2948 
2949     int row = y - radius;
2950     int baseRow = get_group_id(1) * get_local_size(1) - radius;
2951     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2952 
2953     while (row < endRow) {
2954       int srcy = (row < 0) ? -row : row; // mirror pad
2955       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2956 
2957       float4 value = 0.0f;
2958 
2959       int ix = x - radius;
2960       int i = 0;
2961 
2962       while (i + 7 < width) {
2963         for (int j = 0; j < 8; ++j) { // unrolled
2964           int srcx = ix + j;
2965           srcx = (srcx < 0) ? -srcx : srcx;
2966           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2967           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2968         }
2969         ix += 8;
2970         i += 8;
2971       }
2972 
2973       while (i < width) {
2974         int srcx = (ix < 0) ? -ix : ix; // mirror pad
2975         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2976         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2977         ++i;
2978         ++ix;
2979       }
2980       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2981       row += get_local_size(1);
2982     }
2983 
2984     barrier(CLK_LOCAL_MEM_FENCE);
2985 
2986     const int px = get_local_id(0);
2987     const int py = get_local_id(1);
2988     const int prp = get_local_size(0);
2989     float4 value = (float4)(0.0f);
2990 
2991     int i = 0;
2992     while (i + 7 < width) {
2993       for (int j = 0; j < 8; ++j) // unrolled
2994         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2995       i += 8;
2996     }
2997     while (i < width) {
2998       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2999       ++i;
3000     }
3001 
3002     if ((x < columns) && (y < rows)) {
3003       float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3004       float4 diff = srcPixel - value;
3005 
3006       float quantumThreshold = QuantumRange*threshold;
3007 
3008       int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3009       value = select(srcPixel + diff * gain, srcPixel, mask);
3010 
3011       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3012     }
3013   }
3014   )
3015 
3016   STRINGIFY(
3017     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3018     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3019       const unsigned int number_channels,const unsigned int max_channels,
3020       const float threshold,const int passes,const unsigned int imageWidth,
3021       const unsigned int imageHeight)
3022   {
3023     const int pad = (1 << (passes - 1));
3024     const int tileSize = 64;
3025     const int tileRowPixels = 64;
3026     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3027 
3028     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3029 
3030     local float buffer[64 * 64];
3031 
3032     int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3033     int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3034 
3035     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3036       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3037                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3038 
3039       for (int channel = 0; channel < max_channels; ++channel)
3040         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3041     }
3042 
3043     for (int channel = 0; channel < max_channels; ++channel) {
3044       // Load LDS
3045       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3046         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3047 
3048       // Process
3049 
3050       float tmp[16];
3051       float accum[16];
3052       float pixel;
3053 
3054       for (int i = 0; i < 16; i++)
3055         accum[i]=0.0f;
3056 
3057       for (int pass = 0; pass < passes; ++pass) {
3058         const int radius = 1 << pass;
3059         const int x = get_local_id(0);
3060         const float thresh = threshold * noise[pass];
3061 
3062         // Apply horizontal hat
3063         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3064           const int offset = i * tileRowPixels;
3065           if (pass == 0)
3066             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3067           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3068           barrier(CLK_LOCAL_MEM_FENCE);
3069           buffer[x + offset] = pixel;
3070         }
3071         barrier(CLK_LOCAL_MEM_FENCE);
3072 
3073         // Apply vertical hat
3074         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3075           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3076           float delta = tmp[i / 4] - pixel;
3077           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3078           if (delta < -thresh)
3079             delta += thresh;
3080           else if (delta > thresh)
3081             delta -= thresh;
3082           else
3083             delta = 0;
3084           accum[i / 4] += delta;
3085         }
3086         barrier(CLK_LOCAL_MEM_FENCE);
3087 
3088         if (pass < passes - 1)
3089           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3090             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3091         else  // last pass
3092           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3093             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3094         barrier(CLK_LOCAL_MEM_FENCE);
3095       }
3096 
3097       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3098         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3099 
3100       barrier(CLK_LOCAL_MEM_FENCE);
3101     }
3102 
3103     // Write from stage to output
3104 
3105     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3106       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3107         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3108           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3109           for (int channel = 0; channel < max_channels; ++channel) {
3110             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3111           }
3112         }
3113       }
3114     }
3115   }
3116   )
3117 
3118   ;
3119 
3120 #endif // MAGICKCORE_OPENCL_SUPPORT
3121 
3122 #if defined(__cplusplus) || defined(c_plusplus)
3123 }
3124 #endif
3125 
3126 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3127