1 /*
2   Copyright 1999-2016 ImageMagick Studio LLC, a non-profit organization
3   dedicated to making software imaging solutions freely available.
4 
5   You may not use this file except in compliance with the License.
6   obtain a copy of the License at
7 
8     http://www.imagemagick.org/script/license.php
9 
10   Unless required by applicable law or agreed to in writing, software
11   distributed under the License is distributed on an "AS IS" BASIS,
12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   See the License for the specific language governing permissions and
14   limitations under the License.
15 
16   MagickCore private kernels for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29   Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...)	"\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...)	"\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE()		"\n #""else " " \n"
34 #define OPENCL_ENDIF()		"\n #""endif " " \n"
35 #define OPENCL_IF(...)		"\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 /*
39   Typedef declarations.
40 */
41 
42 typedef struct _FloatPixelPacket
43 {
44   MagickRealType
45     red,
46     green,
47     blue,
48     alpha,
49     black;
50 } FloatPixelPacket;
51 
52 const char *accelerateKernels =
53 
54 /*
55   Define declarations.
56 */
57   OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
58   OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
59   OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
60   OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
61   OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
62   OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
63   OPENCL_DEFINE(SigmaRandom, (attenuate))
64   OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
65   OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
66   OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
67 
68 /*
69   Typedef declarations.
70 */
71   STRINGIFY(
72     typedef enum
73   {
74     UndefinedColorspace,
75     RGBColorspace,            /* Linear RGB colorspace */
76     GRAYColorspace,           /* greyscale (linear) image (faked 1 channel) */
77     TransparentColorspace,
78     OHTAColorspace,
79     LabColorspace,
80     XYZColorspace,
81     YCbCrColorspace,
82     YCCColorspace,
83     YIQColorspace,
84     YPbPrColorspace,
85     YUVColorspace,
86     CMYKColorspace,           /* negared linear RGB with black separated */
87     sRGBColorspace,           /* Default: non-lienar sRGB colorspace */
88     HSBColorspace,
89     HSLColorspace,
90     HWBColorspace,
91     Rec601LumaColorspace,
92     Rec601YCbCrColorspace,
93     Rec709LumaColorspace,
94     Rec709YCbCrColorspace,
95     LogColorspace,
96     CMYColorspace,            /* negated linear RGB colorspace */
97     LuvColorspace,
98     HCLColorspace,
99     LCHColorspace,            /* alias for LCHuv */
100     LMSColorspace,
101     LCHabColorspace,          /* Cylindrical (Polar) Lab */
102     LCHuvColorspace,          /* Cylindrical (Polar) Luv */
103     scRGBColorspace,
104     HSIColorspace,
105     HSVColorspace,            /* alias for HSB */
106     HCLpColorspace,
107     YDbDrColorspace
108   } ColorspaceType;
109   )
110 
111   STRINGIFY(
112     typedef enum
113     {
114       UndefinedCompositeOp,
115       NoCompositeOp,
116       ModulusAddCompositeOp,
117       AtopCompositeOp,
118       BlendCompositeOp,
119       BumpmapCompositeOp,
120       ChangeMaskCompositeOp,
121       ClearCompositeOp,
122       ColorBurnCompositeOp,
123       ColorDodgeCompositeOp,
124       ColorizeCompositeOp,
125       CopyBlackCompositeOp,
126       CopyBlueCompositeOp,
127       CopyCompositeOp,
128       CopyCyanCompositeOp,
129       CopyGreenCompositeOp,
130       CopyMagentaCompositeOp,
131       CopyOpacityCompositeOp,
132       CopyRedCompositeOp,
133       CopyYellowCompositeOp,
134       DarkenCompositeOp,
135       DstAtopCompositeOp,
136       DstCompositeOp,
137       DstInCompositeOp,
138       DstOutCompositeOp,
139       DstOverCompositeOp,
140       DifferenceCompositeOp,
141       DisplaceCompositeOp,
142       DissolveCompositeOp,
143       ExclusionCompositeOp,
144       HardLightCompositeOp,
145       HueCompositeOp,
146       InCompositeOp,
147       LightenCompositeOp,
148       LinearLightCompositeOp,
149       LuminizeCompositeOp,
150       MinusDstCompositeOp,
151       ModulateCompositeOp,
152       MultiplyCompositeOp,
153       OutCompositeOp,
154       OverCompositeOp,
155       OverlayCompositeOp,
156       PlusCompositeOp,
157       ReplaceCompositeOp,
158       SaturateCompositeOp,
159       ScreenCompositeOp,
160       SoftLightCompositeOp,
161       SrcAtopCompositeOp,
162       SrcCompositeOp,
163       SrcInCompositeOp,
164       SrcOutCompositeOp,
165       SrcOverCompositeOp,
166       ModulusSubtractCompositeOp,
167       ThresholdCompositeOp,
168       XorCompositeOp,
169       /* These are new operators, added after the above was last sorted.
170        * The list should be re-sorted only when a new library version is
171        * created.
172        */
173       DivideDstCompositeOp,
174       DistortCompositeOp,
175       BlurCompositeOp,
176       PegtopLightCompositeOp,
177       VividLightCompositeOp,
178       PinLightCompositeOp,
179       LinearDodgeCompositeOp,
180       LinearBurnCompositeOp,
181       MathematicsCompositeOp,
182       DivideSrcCompositeOp,
183       MinusSrcCompositeOp,
184       DarkenIntensityCompositeOp,
185       LightenIntensityCompositeOp
186     } CompositeOperator;
187   )
188 
189   STRINGIFY(
190      typedef enum
191      {
192        UndefinedFunction,
193        ArcsinFunction,
194        ArctanFunction,
195        PolynomialFunction,
196        SinusoidFunction
197      } MagickFunction;
198   )
199 
200   STRINGIFY(
201     typedef enum
202     {
203       UndefinedNoise,
204       UniformNoise,
205       GaussianNoise,
206       MultiplicativeGaussianNoise,
207       ImpulseNoise,
208       LaplacianNoise,
209       PoissonNoise,
210       RandomNoise
211     } NoiseType;
212   )
213 
214   STRINGIFY(
215     typedef enum
216   {
217     UndefinedPixelIntensityMethod = 0,
218     AveragePixelIntensityMethod,
219     BrightnessPixelIntensityMethod,
220     LightnessPixelIntensityMethod,
221     Rec601LumaPixelIntensityMethod,
222     Rec601LuminancePixelIntensityMethod,
223     Rec709LumaPixelIntensityMethod,
224     Rec709LuminancePixelIntensityMethod,
225     RMSPixelIntensityMethod,
226     MSPixelIntensityMethod
227   } PixelIntensityMethod;
228   )
229 
230   STRINGIFY(
231   typedef enum {
232     BoxWeightingFunction = 0,
233     TriangleWeightingFunction,
234     CubicBCWeightingFunction,
235     HannWeightingFunction,
236     HammingWeightingFunction,
237     BlackmanWeightingFunction,
238     GaussianWeightingFunction,
239     QuadraticWeightingFunction,
240     JincWeightingFunction,
241     SincWeightingFunction,
242     SincFastWeightingFunction,
243     KaiserWeightingFunction,
244     WelshWeightingFunction,
245     BohmanWeightingFunction,
246     LagrangeWeightingFunction,
247     CosineWeightingFunction,
248   } ResizeWeightingFunctionType;
249   )
250 
251   STRINGIFY(
252      typedef enum
253      {
254        UndefinedChannel = 0x0000,
255        RedChannel = 0x0001,
256        GrayChannel = 0x0001,
257        CyanChannel = 0x0001,
258        GreenChannel = 0x0002,
259        MagentaChannel = 0x0002,
260        BlueChannel = 0x0004,
261        YellowChannel = 0x0004,
262        BlackChannel = 0x0008,
263        AlphaChannel = 0x0010,
264        OpacityChannel = 0x0010,
265        IndexChannel = 0x0020,             /* Color Index Table? */
266        ReadMaskChannel = 0x0040,          /* Pixel is Not Readable? */
267        WriteMaskChannel = 0x0080,         /* Pixel is Write Protected? */
268        MetaChannel = 0x0100,              /* ???? */
269        CompositeChannels = 0x001F,
270        AllChannels = 0x7ffffff,
271        /*
272          Special purpose channel types.
273          FUTURE: are these needed any more - they are more like hacks
274          SyncChannels for example is NOT a real channel but a 'flag'
275          It really says -- "User has not defined channels"
276          Though it does have extra meaning in the "-auto-level" operator
277        */
278        TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
279        RGBChannels = 0x0200,      /* set alpha from grayscale mask in RGB */
280        GrayChannels = 0x0400,
281        SyncChannels = 0x20000,    /* channels modified as a single unit */
282        DefaultChannels = AllChannels
283      } ChannelType;  /* must correspond to PixelChannel */
284   )
285 
286 /*
287   Helper functions.
288 */
289 
290 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
291 
292   STRINGIFY(
293     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
294     {
295       return((CLQuantum) value);
296     }
297   )
298 
299 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
300 
301   STRINGIFY(
302     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
303     {
304       return((CLQuantum) (257.0f*value));
305     }
306   )
307 
308 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
309 
310   STRINGIFY(
311     inline CLQuantum ScaleCharToQuantum(const unsigned char value)
312     {
313       return((CLQuantum) (16843009.0*value));
314     }
315   )
316 
317 OPENCL_ENDIF()
318 
319   STRINGIFY(
320     inline int ClampToCanvas(const int offset,const int range)
321       {
322         return clamp(offset, (int)0, range-1);
323       }
324   )
325 
326   STRINGIFY(
327     inline CLQuantum ClampToQuantum(const float value)
328       {
329         return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
330       }
331   )
332 
333   STRINGIFY(
334     inline uint ScaleQuantumToMap(CLQuantum value)
335       {
336         if (value >= (CLQuantum) MaxMap)
337           return ((uint)MaxMap);
338         else
339           return ((uint)value);
340       }
341   )
342 
343   STRINGIFY(
344     inline float PerceptibleReciprocal(const float x)
345     {
346       float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347       return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348     }
349   )
350 
351   STRINGIFY(
352   inline float RoundToUnity(const float value)
353    {
354      return clamp(value,0.0f,1.0f);
355    }
356   )
357 
358   STRINGIFY(
359 
360   inline unsigned int getPixelIndex(const unsigned int number_channels,
361     const unsigned int columns, const unsigned int x, const unsigned int y)
362   {
363     return (x * number_channels) + (y * columns * number_channels);
364   }
365 
366   inline float getPixelRed(const __global CLQuantum *p)   { return (float)*p; }
367   inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
368   inline float getPixelBlue(const __global CLQuantum *p)  { return (float)*(p+2); }
369   inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
370 
371   inline void setPixelRed(__global CLQuantum *p,const CLQuantum value)   { *p=value; }
372   inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
373   inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value)  { *(p+2)=value; }
374   inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
375 
376   inline CLQuantum getBlue(CLPixelType p)               { return p.x; }
377   inline void setBlue(CLPixelType* p, CLQuantum value)  { (*p).x = value; }
378   inline float getBlueF4(float4 p)                      { return p.x; }
379   inline void setBlueF4(float4* p, float value)         { (*p).x = value; }
380 
381   inline CLQuantum getGreen(CLPixelType p)              { return p.y; }
382   inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
383   inline float getGreenF4(float4 p)                     { return p.y; }
384   inline void setGreenF4(float4* p, float value)        { (*p).y = value; }
385 
386   inline CLQuantum getRed(CLPixelType p)                { return p.z; }
387   inline void setRed(CLPixelType* p, CLQuantum value)   { (*p).z = value; }
388   inline float getRedF4(float4 p)                       { return p.z; }
389   inline void setRedF4(float4* p, float value)          { (*p).z = value; }
390 
391   inline CLQuantum getAlpha(CLPixelType p)              { return p.w; }
392   inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
393   inline float getAlphaF4(float4 p)                     { return p.w; }
394   inline void setAlphaF4(float4* p, float value)        { (*p).w = value; }
395 
396   inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
397     const ChannelType channel, float *red, float *green, float *blue, float *alpha)
398   {
399     if ((channel & RedChannel) != 0)
400       *red=getPixelRed(p);
401 
402     if (number_channels > 2)
403       {
404         if ((channel & GreenChannel) != 0)
405           *green=getPixelGreen(p);
406 
407         if ((channel & BlueChannel) != 0)
408           *blue=getPixelBlue(p);
409       }
410 
411     if (((number_channels == 4) || (number_channels == 2)) &&
412         ((channel & AlphaChannel) != 0))
413       *alpha=getPixelAlpha(p,number_channels);
414   }
415 
416   inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
417     const unsigned int columns, const unsigned int x, const unsigned int y)
418   {
419     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
420 
421     float4 pixel;
422 
423     pixel.x=getPixelRed(p);
424 
425     if (number_channels > 2)
426       {
427         pixel.y=getPixelGreen(p);
428         pixel.z=getPixelBlue(p);
429       }
430 
431     if ((number_channels == 4) || (number_channels == 2))
432       pixel.w=getPixelAlpha(p,number_channels);
433     return(pixel);
434   }
435 
436   inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
437     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
438   {
439     const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
440 
441     float red = 0.0f;
442     float green = 0.0f;
443     float blue = 0.0f;
444     float alpha = 0.0f;
445 
446     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
447     return (float4)(red, green, blue, alpha);
448   }
449 
450   inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
451     const ChannelType channel, float red, float green, float blue, float alpha)
452   {
453     if ((channel & RedChannel) != 0)
454       setPixelRed(p,ClampToQuantum(red));
455 
456     if (number_channels > 2)
457       {
458         if ((channel & GreenChannel) != 0)
459           setPixelGreen(p,ClampToQuantum(green));
460 
461         if ((channel & BlueChannel) != 0)
462           setPixelBlue(p,ClampToQuantum(blue));
463       }
464 
465     if (((number_channels == 4) || (number_channels == 2)) &&
466         ((channel & AlphaChannel) != 0))
467       setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
468   }
469 
470   inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
471     const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
472   {
473     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
474 
475     setPixelRed(p,ClampToQuantum(pixel.x));
476 
477     if (number_channels > 2)
478       {
479         setPixelGreen(p,ClampToQuantum(pixel.y));
480         setPixelBlue(p,ClampToQuantum(pixel.z));
481       }
482 
483     if ((number_channels == 4) || (number_channels == 2))
484       setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
485   }
486 
487   inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
488     const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
489     float4 pixel)
490   {
491     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
492     WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
493   }
494 
495   inline float GetPixelIntensity(const unsigned int colorspace,
496     const unsigned int method,float red,float green,float blue)
497   {
498     float intensity;
499 
500     if (colorspace == GRAYColorspace)
501       return red;
502 
503     switch (method)
504     {
505       case AveragePixelIntensityMethod:
506         {
507           intensity=(red+green+blue)/3.0;
508           break;
509         }
510       case BrightnessPixelIntensityMethod:
511         {
512           intensity=MagickMax(MagickMax(red,green),blue);
513           break;
514         }
515       case LightnessPixelIntensityMethod:
516         {
517           intensity=(MagickMin(MagickMin(red,green),blue)+
518               MagickMax(MagickMax(red,green),blue))/2.0;
519           break;
520         }
521       case MSPixelIntensityMethod:
522         {
523           intensity=(float) (((float) red*red+green*green+blue*blue)/
524               (3.0*QuantumRange));
525           break;
526         }
527       case Rec601LumaPixelIntensityMethod:
528         {
529           /*
530           if (image->colorspace == RGBColorspace)
531           {
532             red=EncodePixelGamma(red);
533             green=EncodePixelGamma(green);
534             blue=EncodePixelGamma(blue);
535           }
536           */
537           intensity=0.298839*red+0.586811*green+0.114350*blue;
538           break;
539         }
540       case Rec601LuminancePixelIntensityMethod:
541         {
542           /*
543           if (image->colorspace == sRGBColorspace)
544           {
545             red=DecodePixelGamma(red);
546             green=DecodePixelGamma(green);
547             blue=DecodePixelGamma(blue);
548           }
549           */
550           intensity=0.298839*red+0.586811*green+0.114350*blue;
551           break;
552         }
553       case Rec709LumaPixelIntensityMethod:
554       default:
555         {
556           /*
557           if (image->colorspace == RGBColorspace)
558           {
559             red=EncodePixelGamma(red);
560             green=EncodePixelGamma(green);
561             blue=EncodePixelGamma(blue);
562           }
563           */
564           intensity=0.212656*red+0.715158*green+0.072186*blue;
565           break;
566         }
567       case Rec709LuminancePixelIntensityMethod:
568         {
569           /*
570           if (image->colorspace == sRGBColorspace)
571           {
572             red=DecodePixelGamma(red);
573             green=DecodePixelGamma(green);
574             blue=DecodePixelGamma(blue);
575           }
576           */
577           intensity=0.212656*red+0.715158*green+0.072186*blue;
578           break;
579         }
580       case RMSPixelIntensityMethod:
581         {
582           intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
583               sqrt(3.0));
584           break;
585         }
586     }
587 
588     return intensity;
589   }
590 
591   inline int mirrorBottom(int value)
592   {
593       return (value < 0) ? - (value) : value;
594   }
595 
596   inline int mirrorTop(int value, int width)
597   {
598       return (value >= width) ? (2 * width - value - 1) : value;
599   }
600   )
601 
602 /*
603 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
604 %                                                                             %
605 %                                                                             %
606 %                                                                             %
607 %     A d d N o i s e                                                         %
608 %                                                                             %
609 %                                                                             %
610 %                                                                             %
611 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
612 */
613 
614   STRINGIFY(
615   /*
616   Part of MWC64X by David Thomas, dt10@imperial.ac.uk
617   This is provided under BSD, full license is with the main package.
618   See http://www.doc.ic.ac.uk/~dt10/research
619   */
620 
621   // Pre: a<M, b<M
622   // Post: r=(a+b) mod M
623   ulong MWC_AddMod64(ulong a, ulong b, ulong M)
624   {
625     ulong v=a+b;
626     //if( (v>=M) || (v<a) )
627     if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
628       v=v-M;
629     return v;
630   }
631 
632   // Pre: a<M,b<M
633   // Post: r=(a*b) mod M
634   // This could be done more efficently, but it is portable, and should
635   // be easy to understand. It can be replaced with any of the better
636   // modular multiplication algorithms (for example if you know you have
637   // double precision available or something).
638   ulong MWC_MulMod64(ulong a, ulong b, ulong M)
639   {
640     ulong r=0;
641     while(a!=0){
642       if(a&1)
643         r=MWC_AddMod64(r,b,M);
644       b=MWC_AddMod64(b,b,M);
645       a=a>>1;
646     }
647     return r;
648   }
649 
650   // Pre: a<M, e>=0
651   // Post: r=(a^b) mod M
652   // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
653   // most architectures
654   ulong MWC_PowMod64(ulong a, ulong e, ulong M)
655   {
656     ulong sqr=a, acc=1;
657     while(e!=0){
658       if(e&1)
659         acc=MWC_MulMod64(acc,sqr,M);
660         sqr=MWC_MulMod64(sqr,sqr,M);
661       e=e>>1;
662     }
663     return acc;
664   }
665 
666   uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
667   {
668     ulong m=MWC_PowMod64(A, distance, M);
669     ulong x=curr.x*(ulong)A+curr.y;
670     x=MWC_MulMod64(x, m, M);
671     return (uint2)((uint)(x/A), (uint)(x%A));
672   }
673 
674   uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
675   {
676     // This is an arbitrary constant for starting LCG jumping from. I didn't
677     // want to start from 1, as then you end up with the two or three first values
678     // being a bit poor in ones - once you've decided that, one constant is as
679     // good as any another. There is no deep mathematical reason for it, I just
680     // generated a random number.
681     enum{ MWC_BASEID = 4077358422479273989UL };
682 
683     ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
684     ulong m=MWC_PowMod64(A, dist, M);
685 
686     ulong x=MWC_MulMod64(MWC_BASEID, m, M);
687     return (uint2)((uint)(x/A), (uint)(x%A));
688   }
689 
690   //! Represents the state of a particular generator
691   typedef struct{ uint x; uint c; } mwc64x_state_t;
692 
693   enum{ MWC64X_A = 4294883355U };
694   enum{ MWC64X_M = 18446383549859758079UL };
695 
696   void MWC64X_Step(mwc64x_state_t *s)
697   {
698     uint X=s->x, C=s->c;
699 
700     uint Xn=MWC64X_A*X+C;
701     uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
702     uint Cn=mad_hi(MWC64X_A,X,carry);
703 
704     s->x=Xn;
705     s->c=Cn;
706   }
707 
708   void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
709   {
710     uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), MWC64X_A, MWC64X_M, distance);
711     s->x=tmp.x;
712     s->c=tmp.y;
713   }
714 
715   void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
716   {
717     uint2 tmp=MWC_SeedImpl_Mod64(MWC64X_A, MWC64X_M, 1, 0, baseOffset, perStreamOffset);
718     s->x=tmp.x;
719     s->c=tmp.y;
720   }
721 
722   //! Return a 32-bit integer in the range [0..2^32)
723   uint MWC64X_NextUint(mwc64x_state_t *s)
724   {
725     uint res=s->x ^ s->c;
726     MWC64X_Step(s);
727     return res;
728   }
729 
730   //
731   // End of MWC64X excerpt
732   //
733 
734   float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
735   {
736     return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
737   }
738 
739   float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
740   {
741     float
742       alpha,
743       beta,
744       noise,
745       sigma;
746 
747     noise = 0.0f;
748     alpha=mwcReadPseudoRandomValue(r);
749     switch(noise_type)
750     {
751       case UniformNoise:
752       default:
753         {
754           noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
755           break;
756         }
757       case GaussianNoise:
758         {
759           float
760             gamma,
761             tau;
762 
763           if (alpha == 0.0f)
764             alpha=1.0f;
765           beta=mwcReadPseudoRandomValue(r);
766           gamma=sqrt(-2.0f*log(alpha));
767           sigma=gamma*cospi((2.0f*beta));
768           tau=gamma*sinpi((2.0f*beta));
769           noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
770           break;
771         }
772       case ImpulseNoise:
773       {
774         if (alpha < (SigmaImpulse/2.0f))
775           noise=0.0f;
776         else
777           if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
778             noise=QuantumRange;
779           else
780             noise=pixel;
781         break;
782       }
783       case LaplacianNoise:
784       {
785         if (alpha <= 0.5f)
786           {
787             if (alpha <= MagickEpsilon)
788               noise=(pixel-QuantumRange);
789             else
790               noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
791                 0.5f);
792             break;
793           }
794         beta=1.0f-alpha;
795         if (beta <= (0.5f*MagickEpsilon))
796           noise=(pixel+QuantumRange);
797         else
798           noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
799         break;
800       }
801       case MultiplicativeGaussianNoise:
802       {
803         sigma=1.0f;
804         if (alpha > MagickEpsilon)
805           sigma=sqrt(-2.0f*log(alpha));
806         beta=mwcReadPseudoRandomValue(r);
807         noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
808           cospi((2.0f*beta))/2.0f);
809         break;
810       }
811       case PoissonNoise:
812       {
813         float
814           poisson;
815         unsigned int i;
816         poisson=exp(-SigmaPoisson*QuantumScale*pixel);
817         for (i=0; alpha > poisson; i++)
818         {
819           beta=mwcReadPseudoRandomValue(r);
820           alpha*=beta;
821         }
822         noise=(QuantumRange*i/SigmaPoisson);
823         break;
824       }
825       case RandomNoise:
826       {
827         noise=(QuantumRange*SigmaRandom*alpha);
828         break;
829       }
830     }
831     return noise;
832   }
833 
834   __kernel
835   void AddNoise(const __global CLQuantum *image,
836     const unsigned int number_channels,const ChannelType channel,
837     const unsigned int length,const unsigned int pixelsPerWorkItem,
838     const NoiseType noise_type,const float attenuate,const unsigned int seed0,
839     const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
840     __global CLQuantum *filteredImage)
841   {
842     mwc64x_state_t rng;
843     rng.x = seed0;
844     rng.c = seed1;
845 
846     uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
847     uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
848     MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
849 
850     uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
851     uint count = pixelsPerWorkItem;
852 
853     while (count > 0 && pos < length)
854     {
855       const __global CLQuantum *p = image + pos;
856       __global CLQuantum *q = filteredImage + pos;
857 
858       float red;
859       float green;
860       float blue;
861       float alpha;
862 
863       ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
864 
865       if ((channel & RedChannel) != 0)
866         red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
867 
868       if (number_channels > 2)
869       {
870         if ((channel & GreenChannel) != 0)
871           green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
872 
873         if ((channel & BlueChannel) != 0)
874           blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
875       }
876 
877       if (((number_channels == 4) || (number_channels == 2)) &&
878           ((channel & AlphaChannel) != 0))
879         alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
880 
881       WriteChannels(q, number_channels, channel, red, green, blue, alpha);
882 
883       pos += (get_local_size(0) * number_channels);
884       count--;
885     }
886   }
887   )
888 
889 /*
890 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
891 %                                                                             %
892 %                                                                             %
893 %                                                                             %
894 %    B l u r                                                                  %
895 %                                                                             %
896 %                                                                             %
897 %                                                                             %
898 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
899 */
900 
901   STRINGIFY(
902   /*
903   Reduce image noise and reduce detail levels by row
904   */
905   __kernel void BlurRow(const __global CLQuantum *image,
906     const unsigned int number_channels,const ChannelType channel,
907     __constant float *filter,const unsigned int width,
908     const unsigned int imageColumns,const unsigned int imageRows,
909     __local float4 *temp,__global float4 *tempImage)
910   {
911     const int x = get_global_id(0);
912     const int y = get_global_id(1);
913 
914     const int columns = imageColumns;
915 
916     const unsigned int radius = (width-1)/2;
917     const int wsize = get_local_size(0);
918     const unsigned int loadSize = wsize+width;
919 
920     //group coordinate
921     const int groupX=get_local_size(0)*get_group_id(0);
922 
923     //parallel load and clamp
924     for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
925     {
926       int cx = ClampToCanvas(i + groupX - radius, columns);
927       temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
928     }
929 
930     // barrier
931     barrier(CLK_LOCAL_MEM_FENCE);
932 
933     // only do the work if this is not a patched item
934     if (get_global_id(0) < columns)
935     {
936       // compute
937       float4 result = (float4) 0;
938 
939       int i = 0;
940 
941       for ( ; i+7 < width; )
942       {
943         for (int j=0; j < 8; j++)
944           result+=filter[i+j]*temp[i+j+get_local_id(0)];
945         i+=8;
946       }
947 
948       for ( ; i < width; i++)
949         result+=filter[i]*temp[i+get_local_id(0)];
950 
951       // write back to global
952       tempImage[y*columns+x] = result;
953     }
954   }
955   )
956 
957   STRINGIFY(
958   /*
959   Reduce image noise and reduce detail levels by line
960   */
961   __kernel void BlurColumn(const __global float4 *blurRowData,
962     const unsigned int number_channels,const ChannelType channel,
963     __constant float *filter,const unsigned int width,
964     const unsigned int imageColumns,const unsigned int imageRows,
965     __local float4 *temp,__global CLQuantum *filteredImage)
966   {
967     const int x = get_global_id(0);
968     const int y = get_global_id(1);
969 
970     const int columns = imageColumns;
971     const int rows = imageRows;
972 
973     unsigned int radius = (width-1)/2;
974     const int wsize = get_local_size(1);
975     const unsigned int loadSize = wsize+width;
976 
977     //group coordinate
978     const int groupX=get_local_size(0)*get_group_id(0);
979     const int groupY=get_local_size(1)*get_group_id(1);
980     //notice that get_local_size(0) is 1, so
981     //groupX=get_group_id(0);
982 
983     //parallel load and clamp
984     for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
985       temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
986 
987     // barrier
988     barrier(CLK_LOCAL_MEM_FENCE);
989 
990     // only do the work if this is not a patched item
991     if (get_global_id(1) < rows)
992     {
993       // compute
994       float4 result = (float4) 0;
995 
996       int i = 0;
997 
998       for ( ; i+7 < width; )
999       {
1000         for (int j=0; j < 8; j++)
1001           result+=filter[i+j]*temp[i+j+get_local_id(1)];
1002         i+=8;
1003       }
1004 
1005       for ( ; i < width; i++)
1006         result+=filter[i]*temp[i+get_local_id(1)];
1007 
1008       // write back to global
1009       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1010     }
1011   }
1012   )
1013 
1014 /*
1015 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1016 %                                                                             %
1017 %                                                                             %
1018 %                                                                             %
1019 %    C o n t r a s t                                                          %
1020 %                                                                             %
1021 %                                                                             %
1022 %                                                                             %
1023 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1024 */
1025 
1026   STRINGIFY(
1027 
1028   inline float3 ConvertRGBToHSB(CLPixelType pixel) {
1029     float3 HueSaturationBrightness;
1030     HueSaturationBrightness.x = 0.0f; // Hue
1031     HueSaturationBrightness.y = 0.0f; // Saturation
1032     HueSaturationBrightness.z = 0.0f; // Brightness
1033 
1034     float r=(float) getRed(pixel);
1035     float g=(float) getGreen(pixel);
1036     float b=(float) getBlue(pixel);
1037 
1038     float tmin=MagickMin(MagickMin(r,g),b);
1039     float tmax=MagickMax(MagickMax(r,g),b);
1040 
1041     if (tmax!=0.0f) {
1042       float delta=tmax-tmin;
1043       HueSaturationBrightness.y=delta/tmax;
1044       HueSaturationBrightness.z=QuantumScale*tmax;
1045 
1046       if (delta != 0.0f) {
1047   HueSaturationBrightness.x = ((r == tmax)?0.0f:((g == tmax)?2.0f:4.0f));
1048   HueSaturationBrightness.x += ((r == tmax)?(g-b):((g == tmax)?(b-r):(r-g)))/delta;
1049         HueSaturationBrightness.x/=6.0f;
1050         HueSaturationBrightness.x += (HueSaturationBrightness.x < 0.0f)?0.0f:1.0f;
1051       }
1052     }
1053     return HueSaturationBrightness;
1054   }
1055 
1056   inline CLPixelType ConvertHSBToRGB(float3 HueSaturationBrightness) {
1057 
1058     float hue = HueSaturationBrightness.x;
1059     float brightness = HueSaturationBrightness.z;
1060     float saturation = HueSaturationBrightness.y;
1061 
1062     CLPixelType rgb;
1063 
1064     if (saturation == 0.0f) {
1065       setRed(&rgb,ClampToQuantum(QuantumRange*brightness));
1066       setGreen(&rgb,getRed(rgb));
1067       setBlue(&rgb,getRed(rgb));
1068     }
1069     else {
1070 
1071       float h=6.0f*(hue-floor(hue));
1072       float f=h-floor(h);
1073       float p=brightness*(1.0f-saturation);
1074       float q=brightness*(1.0f-saturation*f);
1075       float t=brightness*(1.0f-(saturation*(1.0f-f)));
1076 
1077       float clampedBrightness = ClampToQuantum(QuantumRange*brightness);
1078       float clamped_t = ClampToQuantum(QuantumRange*t);
1079       float clamped_p = ClampToQuantum(QuantumRange*p);
1080       float clamped_q = ClampToQuantum(QuantumRange*q);
1081       int ih = (int)h;
1082       setRed(&rgb, (ih == 1)?clamped_q:
1083         (ih == 2 || ih == 3)?clamped_p:
1084         (ih == 4)?clamped_t:
1085                  clampedBrightness);
1086 
1087       setGreen(&rgb, (ih == 1 || ih == 2)?clampedBrightness:
1088         (ih == 3)?clamped_q:
1089         (ih == 4 || ih == 5)?clamped_p:
1090                  clamped_t);
1091 
1092       setBlue(&rgb, (ih == 2)?clamped_t:
1093         (ih == 3 || ih == 4)?clampedBrightness:
1094         (ih == 5)?clamped_q:
1095                  clamped_p);
1096     }
1097     return rgb;
1098   }
1099 
1100   __kernel void Contrast(__global CLPixelType *im, const unsigned int sharpen)
1101   {
1102 
1103     const int sign = sharpen!=0?1:-1;
1104     const int x = get_global_id(0);
1105     const int y = get_global_id(1);
1106     const int columns = get_global_size(0);
1107     const int c = x + y * columns;
1108 
1109     CLPixelType pixel = im[c];
1110     float3 HueSaturationBrightness = ConvertRGBToHSB(pixel);
1111     float brightness = HueSaturationBrightness.z;
1112     brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1113     brightness = clamp(brightness,0.0f,1.0f);
1114     HueSaturationBrightness.z = brightness;
1115 
1116     CLPixelType filteredPixel = ConvertHSBToRGB(HueSaturationBrightness);
1117     filteredPixel.w = pixel.w;
1118     im[c] = filteredPixel;
1119   }
1120   )
1121 
1122 /*
1123 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1124 %                                                                             %
1125 %                                                                             %
1126 %                                                                             %
1127 %    C o n t r a s t S t r e t c h                                            %
1128 %                                                                             %
1129 %                                                                             %
1130 %                                                                             %
1131 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1132 */
1133 
1134     STRINGIFY(
1135     /*
1136     */
1137     __kernel void Histogram(__global CLPixelType * restrict im,
1138       const ChannelType channel,
1139       const unsigned int colorspace,
1140       const unsigned int method,
1141       __global uint4 * restrict histogram)
1142       {
1143         const int x = get_global_id(0);
1144         const int y = get_global_id(1);
1145         const int columns = get_global_size(0);
1146         const int c = x + y * columns;
1147         if ((channel & SyncChannels) != 0)
1148         {
1149           float red=(float)getRed(im[c]);
1150           float green=(float)getGreen(im[c]);
1151           float blue=(float)getBlue(im[c]);
1152 
1153           float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1154           uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1155           atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1156         }
1157         else
1158         {
1159           // for equalizing, we always need all channels?
1160           // otherwise something more
1161         }
1162       }
1163     )
1164 
1165     STRINGIFY(
1166     /*
1167     */
1168     __kernel void ContrastStretch(__global CLPixelType * restrict im,
1169       const ChannelType channel,
1170       __global CLPixelType * restrict stretch_map,
1171       const float4 white, const float4 black)
1172       {
1173         const int x = get_global_id(0);
1174         const int y = get_global_id(1);
1175         const int columns = get_global_size(0);
1176         const int c = x + y * columns;
1177 
1178         uint ePos;
1179         CLPixelType oValue, eValue;
1180         CLQuantum red, green, blue, alpha;
1181 
1182         //read from global
1183         oValue=im[c];
1184 
1185         if ((channel & RedChannel) != 0)
1186         {
1187           if (getRedF4(white) != getRedF4(black))
1188           {
1189             ePos = ScaleQuantumToMap(getRed(oValue));
1190             eValue = stretch_map[ePos];
1191             red = getRed(eValue);
1192           }
1193         }
1194 
1195         if ((channel & GreenChannel) != 0)
1196         {
1197           if (getGreenF4(white) != getGreenF4(black))
1198           {
1199             ePos = ScaleQuantumToMap(getGreen(oValue));
1200             eValue = stretch_map[ePos];
1201             green = getGreen(eValue);
1202           }
1203         }
1204 
1205         if ((channel & BlueChannel) != 0)
1206         {
1207           if (getBlueF4(white) != getBlueF4(black))
1208           {
1209             ePos = ScaleQuantumToMap(getBlue(oValue));
1210             eValue = stretch_map[ePos];
1211             blue = getBlue(eValue);
1212           }
1213         }
1214 
1215         if ((channel & AlphaChannel) != 0)
1216         {
1217           if (getAlphaF4(white) != getAlphaF4(black))
1218           {
1219             ePos = ScaleQuantumToMap(getAlpha(oValue));
1220             eValue = stretch_map[ePos];
1221             alpha = getAlpha(eValue);
1222           }
1223         }
1224 
1225         //write back
1226         im[c]=(CLPixelType)(blue, green, red, alpha);
1227 
1228       }
1229     )
1230 
1231 /*
1232 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1233 %                                                                             %
1234 %                                                                             %
1235 %                                                                             %
1236 %    C o n v o l v e                                                          %
1237 %                                                                             %
1238 %                                                                             %
1239 %                                                                             %
1240 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1241 */
1242 
1243   STRINGIFY(
1244     __kernel
1245     void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1246     const unsigned int imageWidth, const unsigned int imageHeight,
1247     __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1248     const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1249 
1250       int2 blockID;
1251       blockID.x = get_global_id(0) / get_local_size(0);
1252       blockID.y = get_global_id(1) / get_local_size(1);
1253 
1254       // image area processed by this workgroup
1255       int2 imageAreaOrg;
1256       imageAreaOrg.x = blockID.x * get_local_size(0);
1257       imageAreaOrg.y = blockID.y * get_local_size(1);
1258 
1259       int2 midFilterDimen;
1260       midFilterDimen.x = (filterWidth-1)/2;
1261       midFilterDimen.y = (filterHeight-1)/2;
1262 
1263       int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1264 
1265       // dimension of the local cache
1266       int2 cachedAreaDimen;
1267       cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1268       cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1269 
1270       // cache the pixels accessed by this workgroup in local memory
1271       int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1272       int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1273       int groupSize = get_local_size(0) * get_local_size(1);
1274       for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1275 
1276         int2 cachedAreaIndex;
1277         cachedAreaIndex.x = i % cachedAreaDimen.x;
1278         cachedAreaIndex.y = i / cachedAreaDimen.x;
1279 
1280         int2 imagePixelIndex;
1281         imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1282 
1283         // only support EdgeVirtualPixelMethod through ClampToCanvas
1284         // TODO: implement other virtual pixel method
1285         imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1286         imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1287 
1288         pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1289       }
1290 
1291       // cache the filter
1292       for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1293         filterCache[i] = filter[i];
1294       }
1295       barrier(CLK_LOCAL_MEM_FENCE);
1296 
1297 
1298       int2 imageIndex;
1299       imageIndex.x = imageAreaOrg.x + get_local_id(0);
1300       imageIndex.y = imageAreaOrg.y + get_local_id(1);
1301 
1302       // if out-of-range, stops here and quit
1303       if (imageIndex.x >= imageWidth
1304         || imageIndex.y >= imageHeight) {
1305           return;
1306       }
1307 
1308       int filterIndex = 0;
1309       float4 sum = (float4)0.0f;
1310       float gamma = 0.0f;
1311       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1312         int cacheIndexY = get_local_id(1);
1313         for (int j = 0; j < filterHeight; j++) {
1314           int cacheIndexX = get_local_id(0);
1315           for (int i = 0; i < filterWidth; i++) {
1316             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1317             float f = filterCache[filterIndex];
1318 
1319             sum.x += f * p.x;
1320             sum.y += f * p.y;
1321             sum.z += f * p.z;
1322             sum.w += f * p.w;
1323 
1324             gamma += f;
1325             filterIndex++;
1326             cacheIndexX++;
1327           }
1328           cacheIndexY++;
1329         }
1330       }
1331       else {
1332         int cacheIndexY = get_local_id(1);
1333         for (int j = 0; j < filterHeight; j++) {
1334           int cacheIndexX = get_local_id(0);
1335           for (int i = 0; i < filterWidth; i++) {
1336 
1337             CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1338             float alpha = QuantumScale*p.w;
1339             float f = filterCache[filterIndex];
1340             float g = alpha * f;
1341 
1342             sum.x += g*p.x;
1343             sum.y += g*p.y;
1344             sum.z += g*p.z;
1345             sum.w += f*p.w;
1346 
1347             gamma += g;
1348             filterIndex++;
1349             cacheIndexX++;
1350           }
1351           cacheIndexY++;
1352         }
1353         gamma = PerceptibleReciprocal(gamma);
1354         sum.xyz = gamma*sum.xyz;
1355       }
1356       CLPixelType outputPixel;
1357       outputPixel.x = ClampToQuantum(sum.x);
1358       outputPixel.y = ClampToQuantum(sum.y);
1359       outputPixel.z = ClampToQuantum(sum.z);
1360       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1361 
1362       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1363     }
1364   )
1365 
1366   STRINGIFY(
1367     __kernel
1368     void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1369                   const uint imageWidth, const uint imageHeight,
1370                   __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1371                   const uint matte, const ChannelType channel) {
1372 
1373       int2 imageIndex;
1374       imageIndex.x = get_global_id(0);
1375       imageIndex.y = get_global_id(1);
1376 
1377       /*
1378       unsigned int imageWidth = get_global_size(0);
1379       unsigned int imageHeight = get_global_size(1);
1380       */
1381       if (imageIndex.x >= imageWidth
1382           || imageIndex.y >= imageHeight)
1383           return;
1384 
1385       int2 midFilterDimen;
1386       midFilterDimen.x = (filterWidth-1)/2;
1387       midFilterDimen.y = (filterHeight-1)/2;
1388 
1389       int filterIndex = 0;
1390       float4 sum = (float4)0.0f;
1391       float gamma = 0.0f;
1392       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1393         for (int j = 0; j < filterHeight; j++) {
1394           int2 inputPixelIndex;
1395           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1396           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1397           for (int i = 0; i < filterWidth; i++) {
1398             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1399             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1400 
1401             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1402             float f = filter[filterIndex];
1403 
1404             sum.x += f * p.x;
1405             sum.y += f * p.y;
1406             sum.z += f * p.z;
1407             sum.w += f * p.w;
1408 
1409             gamma += f;
1410 
1411             filterIndex++;
1412           }
1413         }
1414       }
1415       else {
1416 
1417         for (int j = 0; j < filterHeight; j++) {
1418           int2 inputPixelIndex;
1419           inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1420           inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1421           for (int i = 0; i < filterWidth; i++) {
1422             inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1423             inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1424 
1425             CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1426             float alpha = QuantumScale*p.w;
1427             float f = filter[filterIndex];
1428             float g = alpha * f;
1429 
1430             sum.x += g*p.x;
1431             sum.y += g*p.y;
1432             sum.z += g*p.z;
1433             sum.w += f*p.w;
1434 
1435             gamma += g;
1436 
1437 
1438             filterIndex++;
1439           }
1440         }
1441         gamma = PerceptibleReciprocal(gamma);
1442         sum.xyz = gamma*sum.xyz;
1443       }
1444 
1445       CLPixelType outputPixel;
1446       outputPixel.x = ClampToQuantum(sum.x);
1447       outputPixel.y = ClampToQuantum(sum.y);
1448       outputPixel.z = ClampToQuantum(sum.z);
1449       outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1450 
1451       output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1452     }
1453   )
1454 
1455 /*
1456 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1457 %                                                                             %
1458 %                                                                             %
1459 %                                                                             %
1460 %     D e s p e c k l e                                                       %
1461 %                                                                             %
1462 %                                                                             %
1463 %                                                                             %
1464 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1465 */
1466 
1467   STRINGIFY(
1468 
1469   __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1470   , const unsigned int imageWidth, const unsigned int imageHeight
1471   , const int2 offset, const int polarity, const int matte) {
1472 
1473     int x = get_global_id(0);
1474     int y = get_global_id(1);
1475 
1476     CLPixelType v = inputImage[y*imageWidth+x];
1477 
1478     int2 neighbor;
1479     neighbor.y = y + offset.y;
1480     neighbor.x = x + offset.x;
1481 
1482     int2 clampedNeighbor;
1483     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1484     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1485 
1486     CLPixelType r = (clampedNeighbor.x == neighbor.x
1487                      && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1488     :(CLPixelType)0;
1489 
1490     int sv[4];
1491     sv[0] = (int)v.x;
1492     sv[1] = (int)v.y;
1493     sv[2] = (int)v.z;
1494     sv[3] = (int)v.w;
1495 
1496     int sr[4];
1497     sr[0] = (int)r.x;
1498     sr[1] = (int)r.y;
1499     sr[2] = (int)r.z;
1500     sr[3] = (int)r.w;
1501 
1502     if (polarity > 0) {
1503       \n #pragma unroll 4\n
1504       for (unsigned int i = 0; i < 4; i++) {
1505         sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1506       }
1507     }
1508     else {
1509       \n #pragma unroll 4\n
1510       for (unsigned int i = 0; i < 4; i++) {
1511         sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1512       }
1513 
1514     }
1515 
1516     v.x = (CLQuantum)sv[0];
1517     v.y = (CLQuantum)sv[1];
1518     v.z = (CLQuantum)sv[2];
1519 
1520     if (matte!=0)
1521       v.w = (CLQuantum)sv[3];
1522 
1523     outputImage[y*imageWidth+x] = v;
1524 
1525     }
1526 
1527 
1528   )
1529 
1530   STRINGIFY(
1531 
1532   __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1533   , const unsigned int imageWidth, const unsigned int imageHeight
1534   , const int2 offset, const int polarity, const int matte) {
1535 
1536     int x = get_global_id(0);
1537     int y = get_global_id(1);
1538 
1539     CLPixelType v = inputImage[y*imageWidth+x];
1540 
1541     int2 neighbor, clampedNeighbor;
1542 
1543     neighbor.y = y + offset.y;
1544     neighbor.x = x + offset.x;
1545     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1546     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1547 
1548     CLPixelType r = (clampedNeighbor.x == neighbor.x
1549       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1550     :(CLPixelType)0;
1551 
1552 
1553     neighbor.y = y - offset.y;
1554     neighbor.x = x - offset.x;
1555     clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1556     clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1557 
1558     CLPixelType s = (clampedNeighbor.x == neighbor.x
1559       && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1560     :(CLPixelType)0;
1561 
1562 
1563     int sv[4];
1564     sv[0] = (int)v.x;
1565     sv[1] = (int)v.y;
1566     sv[2] = (int)v.z;
1567     sv[3] = (int)v.w;
1568 
1569     int sr[4];
1570     sr[0] = (int)r.x;
1571     sr[1] = (int)r.y;
1572     sr[2] = (int)r.z;
1573     sr[3] = (int)r.w;
1574 
1575     int ss[4];
1576     ss[0] = (int)s.x;
1577     ss[1] = (int)s.y;
1578     ss[2] = (int)s.z;
1579     ss[3] = (int)s.w;
1580 
1581     if (polarity > 0) {
1582       \n #pragma unroll 4\n
1583       for (unsigned int i = 0; i < 4; i++) {
1584         //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] )   ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1585         //
1586         //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1587         //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) ))  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1588         sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0)  ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1589       }
1590     }
1591     else {
1592       \n #pragma unroll 4\n
1593       for (unsigned int i = 0; i < 4; i++) {
1594         //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] )   ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1595         //
1596         //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1597         sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0)   ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1598       }
1599     }
1600 
1601     v.x = (CLQuantum)sv[0];
1602     v.y = (CLQuantum)sv[1];
1603     v.z = (CLQuantum)sv[2];
1604 
1605     if (matte!=0)
1606       v.w = (CLQuantum)sv[3];
1607 
1608     outputImage[y*imageWidth+x] = v;
1609 
1610     }
1611   )
1612 
1613 /*
1614 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1615 %                                                                             %
1616 %                                                                             %
1617 %                                                                             %
1618 %     E q u a l i z e                                                         %
1619 %                                                                             %
1620 %                                                                             %
1621 %                                                                             %
1622 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1623 */
1624 
1625     STRINGIFY(
1626     /*
1627     */
1628     __kernel void Equalize(__global CLPixelType * restrict im,
1629       const ChannelType channel,
1630       __global CLPixelType * restrict equalize_map,
1631       const float4 white, const float4 black)
1632       {
1633         const int x = get_global_id(0);
1634         const int y = get_global_id(1);
1635         const int columns = get_global_size(0);
1636         const int c = x + y * columns;
1637 
1638         uint ePos;
1639         CLPixelType oValue, eValue;
1640         CLQuantum red, green, blue, alpha;
1641 
1642         //read from global
1643         oValue=im[c];
1644 
1645         if ((channel & SyncChannels) != 0)
1646         {
1647           if (getRedF4(white) != getRedF4(black))
1648           {
1649             ePos = ScaleQuantumToMap(getRed(oValue));
1650             eValue = equalize_map[ePos];
1651             red = getRed(eValue);
1652             ePos = ScaleQuantumToMap(getGreen(oValue));
1653             eValue = equalize_map[ePos];
1654             green = getRed(eValue);
1655             ePos = ScaleQuantumToMap(getBlue(oValue));
1656             eValue = equalize_map[ePos];
1657             blue = getRed(eValue);
1658             ePos = ScaleQuantumToMap(getAlpha(oValue));
1659             eValue = equalize_map[ePos];
1660             alpha = getRed(eValue);
1661 
1662             //write back
1663             im[c]=(CLPixelType)(blue, green, red, alpha);
1664           }
1665 
1666         }
1667 
1668         // for equalizing, we always need all channels?
1669         // otherwise something more
1670 
1671      }
1672     )
1673 
1674 /*
1675 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1676 %                                                                             %
1677 %                                                                             %
1678 %                                                                             %
1679 %     F u n c t i o n                                                         %
1680 %                                                                             %
1681 %                                                                             %
1682 %                                                                             %
1683 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1684 */
1685 
1686   STRINGIFY(
1687   /*
1688   apply FunctionImageChannel(braightness-contrast)
1689   */
1690   CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1691     const unsigned int number_parameters,__constant float *parameters)
1692   {
1693     float result = 0.0f;
1694 
1695     switch (function)
1696     {
1697     case PolynomialFunction:
1698       {
1699         for (unsigned int i=0; i < number_parameters; i++)
1700           result = result*QuantumScale*pixel + parameters[i];
1701         result *= QuantumRange;
1702         break;
1703       }
1704     case SinusoidFunction:
1705       {
1706         float  freq,phase,ampl,bias;
1707         freq  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1708         phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1709         ampl  = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1710         bias  = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1711         result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1712           (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1713         break;
1714       }
1715     case ArcsinFunction:
1716       {
1717         float  width,range,center,bias;
1718         width  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1719         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1720         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1721         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1722 
1723         result = 2.0f/width*(QuantumScale*pixel - center);
1724         result = range/MagickPI*asin(result)+bias;
1725         result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1726         result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1727         result *= QuantumRange;
1728         break;
1729       }
1730     case ArctanFunction:
1731       {
1732         float slope,range,center,bias;
1733         slope  = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1734         center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1735         range  = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1736         bias   = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1737         result = MagickPI*slope*(QuantumScale*pixel-center);
1738         result = QuantumRange*(range/MagickPI*atan(result) + bias);
1739         break;
1740       }
1741     case UndefinedFunction:
1742       break;
1743     }
1744     return(ClampToQuantum(result));
1745   }
1746   )
1747 
1748   STRINGIFY(
1749   /*
1750   Improve brightness / contrast of the image
1751   channel : define which channel is improved
1752   function : the function called to enchance the brightness contrast
1753   number_parameters : numbers of parameters
1754   parameters : the parameter
1755   */
1756   __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1757     const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1758     __constant float *parameters)
1759   {
1760     const unsigned int x = get_global_id(0);
1761     const unsigned int y = get_global_id(1);
1762     const unsigned int columns = get_global_size(0);
1763     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1764 
1765     float red;
1766     float green;
1767     float blue;
1768     float alpha;
1769 
1770     ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1771 
1772     if ((channel & RedChannel) != 0)
1773       red=ApplyFunction(red, function, number_parameters, parameters);
1774 
1775     if (number_channels > 2)
1776       {
1777         if ((channel & GreenChannel) != 0)
1778           green=ApplyFunction(green, function, number_parameters, parameters);
1779 
1780         if ((channel & BlueChannel) != 0)
1781           blue=ApplyFunction(blue, function, number_parameters, parameters);
1782       }
1783 
1784     if (((number_channels == 4) || (number_channels == 2)) &&
1785         ((channel & AlphaChannel) != 0))
1786       alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1787 
1788     WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1789   }
1790   )
1791 
1792 /*
1793 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1794 %                                                                             %
1795 %                                                                             %
1796 %                                                                             %
1797 %     G r a y s c a l e                                                       %
1798 %                                                                             %
1799 %                                                                             %
1800 %                                                                             %
1801 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1802 */
1803 
1804   STRINGIFY(
1805   __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1806     const unsigned int colorspace,const unsigned int method)
1807   {
1808     const unsigned int x = get_global_id(0);
1809     const unsigned int y = get_global_id(1);
1810     const unsigned int columns = get_global_size(0);
1811     __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1812 
1813     float
1814       blue,
1815       green,
1816       red;
1817 
1818     red=getPixelRed(p);
1819     green=getPixelGreen(p);
1820     blue=getPixelBlue(p);
1821 
1822     CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1823 
1824     setPixelRed(p,intensity);
1825     setPixelGreen(p,intensity);
1826     setPixelBlue(p,intensity);
1827   }
1828   )
1829 
1830 /*
1831 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1832 %                                                                             %
1833 %                                                                             %
1834 %                                                                             %
1835 %     L o c a l C o n t r a s t                                               %
1836 %                                                                             %
1837 %                                                                             %
1838 %                                                                             %
1839 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1840 */
1841 
1842     STRINGIFY(
1843 
1844       __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1845           const int radius,
1846           const int imageWidth,
1847           const int imageHeight)
1848       {
1849         const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1850 
1851         int x = get_local_id(0);
1852         int y = get_global_id(1);
1853 
1854         global CLPixelType *src = srcImage + y * imageWidth;
1855 
1856         for (int i = x; i < imageWidth; i += get_local_size(0)) {
1857             float sum = 0.0f;
1858             float weight = 1.0f;
1859 
1860             int j = i - radius;
1861             while ((j + 7) < i) {
1862                 for (int k = 0; k < 8; ++k) // Unroll 8x
1863                     sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1864                 weight += 8.0f;
1865                 j+=8;
1866             }
1867             while (j < i) {
1868                 sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1869                 weight += 1.0f;
1870                 ++j;
1871             }
1872 
1873             while ((j + 7) < radius + i) {
1874                 for (int k = 0; k < 8; ++k) // Unroll 8x
1875                     sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1876                 weight -= 8.0f;
1877                 j+=8;
1878             }
1879             while (j < radius + i) {
1880                 sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1881                 weight -= 1.0f;
1882                 ++j;
1883             }
1884 
1885             tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1886         }
1887       }
1888     )
1889 
1890     STRINGIFY(
1891       __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1892           const int radius,
1893           const float strength,
1894           const int imageWidth,
1895           const int imageHeight)
1896       {
1897         const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1898 
1899         int x = get_global_id(0);
1900         int y = get_global_id(1);
1901 
1902         if ((x >= imageWidth) || (y >= imageHeight))
1903                 return;
1904 
1905         global float *src = blurImage + x;
1906 
1907         float sum = 0.0f;
1908         float weight = 1.0f;
1909 
1910         int j = y - radius;
1911         while ((j + 7) < y) {
1912             for (int k = 0; k < 8; ++k) // Unroll 8x
1913                 sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1914             weight += 8.0f;
1915             j+=8;
1916         }
1917         while (j < y) {
1918             sum += weight * src[mirrorBottom(j) * imageWidth];
1919             weight += 1.0f;
1920             ++j;
1921         }
1922 
1923         while ((j + 7) < radius + y) {
1924             for (int k = 0; k < 8; ++k) // Unroll 8x
1925                 sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1926             weight -= 8.0f;
1927             j+=8;
1928         }
1929         while (j < radius + y) {
1930             sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1931             weight -= 1.0f;
1932             ++j;
1933         }
1934 
1935         CLPixelType pixel = srcImage[x + y * imageWidth];
1936         float srcVal = dot(RGB, convert_float4(pixel));
1937         float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1938         mult = (srcVal + mult) / srcVal;
1939 
1940         pixel.x = ClampToQuantum(pixel.x * mult);
1941         pixel.y = ClampToQuantum(pixel.y * mult);
1942         pixel.z = ClampToQuantum(pixel.z * mult);
1943 
1944         dstImage[x + y * imageWidth] = pixel;
1945       }
1946     )
1947 
1948 /*
1949 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1950 %                                                                             %
1951 %                                                                             %
1952 %                                                                             %
1953 %     M o d u l a t e                                                         %
1954 %                                                                             %
1955 %                                                                             %
1956 %                                                                             %
1957 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1958 */
1959 
1960   STRINGIFY(
1961 
1962   inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1963     float *hue, float *saturation, float *lightness)
1964   {
1965   float
1966     c,
1967     tmax,
1968     tmin;
1969 
1970   /*
1971      Convert RGB to HSL colorspace.
1972      */
1973   tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1974   tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1975 
1976   c=tmax-tmin;
1977 
1978   *lightness=(tmax+tmin)/2.0;
1979   if (c <= 0.0)
1980   {
1981     *hue=0.0;
1982     *saturation=0.0;
1983     return;
1984   }
1985 
1986   if (tmax == (QuantumScale*red))
1987   {
1988     *hue=(QuantumScale*green-QuantumScale*blue)/c;
1989     if ((QuantumScale*green) < (QuantumScale*blue))
1990       *hue+=6.0;
1991   }
1992   else
1993     if (tmax == (QuantumScale*green))
1994       *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
1995     else
1996       *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
1997 
1998   *hue*=60.0/360.0;
1999   if (*lightness <= 0.5)
2000     *saturation=c/(2.0*(*lightness));
2001   else
2002     *saturation=c/(2.0-2.0*(*lightness));
2003   }
2004 
2005   inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2006       CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2007   {
2008     float
2009       b,
2010       c,
2011       g,
2012       h,
2013       tmin,
2014       r,
2015       x;
2016 
2017     /*
2018        Convert HSL to RGB colorspace.
2019        */
2020     h=hue*360.0;
2021     if (lightness <= 0.5)
2022       c=2.0*lightness*saturation;
2023     else
2024       c=(2.0-2.0*lightness)*saturation;
2025     tmin=lightness-0.5*c;
2026     h-=360.0*floor(h/360.0);
2027     h/=60.0;
2028     x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2029     switch ((int) floor(h) % 6)
2030     {
2031       case 0:
2032         {
2033           r=tmin+c;
2034           g=tmin+x;
2035           b=tmin;
2036           break;
2037         }
2038       case 1:
2039         {
2040           r=tmin+x;
2041           g=tmin+c;
2042           b=tmin;
2043           break;
2044         }
2045       case 2:
2046         {
2047           r=tmin;
2048           g=tmin+c;
2049           b=tmin+x;
2050           break;
2051         }
2052       case 3:
2053         {
2054           r=tmin;
2055           g=tmin+x;
2056           b=tmin+c;
2057           break;
2058         }
2059       case 4:
2060         {
2061           r=tmin+x;
2062           g=tmin;
2063           b=tmin+c;
2064           break;
2065         }
2066       case 5:
2067         {
2068           r=tmin+c;
2069           g=tmin;
2070           b=tmin+x;
2071           break;
2072         }
2073       default:
2074         {
2075           r=0.0;
2076           g=0.0;
2077           b=0.0;
2078         }
2079     }
2080     *red=ClampToQuantum(QuantumRange*r);
2081     *green=ClampToQuantum(QuantumRange*g);
2082     *blue=ClampToQuantum(QuantumRange*b);
2083   }
2084 
2085   inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2086     CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2087   {
2088     float
2089       hue,
2090       lightness,
2091       saturation;
2092 
2093     /*
2094     Increase or decrease color lightness, saturation, or hue.
2095     */
2096     ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2097     hue+=0.5*(0.01*percent_hue-1.0);
2098     while (hue < 0.0)
2099       hue+=1.0;
2100     while (hue >= 1.0)
2101       hue-=1.0;
2102     saturation*=0.01*percent_saturation;
2103     lightness*=0.01*percent_lightness;
2104     ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2105   }
2106 
2107   __kernel void Modulate(__global CLPixelType *im,
2108     const float percent_brightness,
2109     const float percent_hue,
2110     const float percent_saturation,
2111     const int colorspace)
2112   {
2113 
2114     const int x = get_global_id(0);
2115     const int y = get_global_id(1);
2116     const int columns = get_global_size(0);
2117     const int c = x + y * columns;
2118 
2119     CLPixelType pixel = im[c];
2120 
2121     CLQuantum
2122         blue,
2123         green,
2124         red;
2125 
2126     red=getRed(pixel);
2127     green=getGreen(pixel);
2128     blue=getBlue(pixel);
2129 
2130     switch (colorspace)
2131     {
2132       case HSLColorspace:
2133       default:
2134         {
2135           ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2136               &red, &green, &blue);
2137         }
2138 
2139     }
2140 
2141     CLPixelType filteredPixel;
2142 
2143     setRed(&filteredPixel, red);
2144     setGreen(&filteredPixel, green);
2145     setBlue(&filteredPixel, blue);
2146     filteredPixel.w = pixel.w;
2147 
2148     im[c] = filteredPixel;
2149   }
2150   )
2151 
2152 /*
2153 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2154 %                                                                             %
2155 %                                                                             %
2156 %                                                                             %
2157 %     M o t i o n B l u r                                                     %
2158 %                                                                             %
2159 %                                                                             %
2160 %                                                                             %
2161 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2162 */
2163 
2164   STRINGIFY(
2165     __kernel
2166     void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2167                     const unsigned int imageWidth, const unsigned int imageHeight,
2168                     const __global float *filter, const unsigned int width, const __global int2* offset,
2169                     const float4 bias,
2170                     const ChannelType channel, const unsigned int matte) {
2171 
2172       int2 currentPixel;
2173       currentPixel.x = get_global_id(0);
2174       currentPixel.y = get_global_id(1);
2175 
2176       if (currentPixel.x >= imageWidth
2177           || currentPixel.y >= imageHeight)
2178           return;
2179 
2180       float4 pixel;
2181       pixel.x = (float)bias.x;
2182       pixel.y = (float)bias.y;
2183       pixel.z = (float)bias.z;
2184       pixel.w = (float)bias.w;
2185 
2186       if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2187 
2188         for (int i = 0; i < width; i++) {
2189           // only support EdgeVirtualPixelMethod through ClampToCanvas
2190           // TODO: implement other virtual pixel method
2191           int2 samplePixel = currentPixel + offset[i];
2192           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2193           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2194           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2195 
2196           pixel.x += (filter[i] * (float)samplePixelValue.x);
2197           pixel.y += (filter[i] * (float)samplePixelValue.y);
2198           pixel.z += (filter[i] * (float)samplePixelValue.z);
2199           pixel.w += (filter[i] * (float)samplePixelValue.w);
2200         }
2201 
2202         CLPixelType outputPixel;
2203         outputPixel.x = ClampToQuantum(pixel.x);
2204         outputPixel.y = ClampToQuantum(pixel.y);
2205         outputPixel.z = ClampToQuantum(pixel.z);
2206         outputPixel.w = ClampToQuantum(pixel.w);
2207         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2208       }
2209       else {
2210 
2211         float gamma = 0.0f;
2212         for (int i = 0; i < width; i++) {
2213           // only support EdgeVirtualPixelMethod through ClampToCanvas
2214           // TODO: implement other virtual pixel method
2215           int2 samplePixel = currentPixel + offset[i];
2216           samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2217           samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2218 
2219           CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2220 
2221           float alpha = QuantumScale*samplePixelValue.w;
2222           float k = filter[i];
2223           pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2224           pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2225           pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2226 
2227           pixel.w += k * alpha * samplePixelValue.w;
2228 
2229           gamma+=k*alpha;
2230         }
2231         gamma = PerceptibleReciprocal(gamma);
2232         pixel.xyz = gamma*pixel.xyz;
2233 
2234         CLPixelType outputPixel;
2235         outputPixel.x = ClampToQuantum(pixel.x);
2236         outputPixel.y = ClampToQuantum(pixel.y);
2237         outputPixel.z = ClampToQuantum(pixel.z);
2238         outputPixel.w = ClampToQuantum(pixel.w);
2239         output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2240       }
2241     }
2242   )
2243 
2244 /*
2245 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2246 %                                                                             %
2247 %                                                                             %
2248 %                                                                             %
2249 %     R e s i z e                                                             %
2250 %                                                                             %
2251 %                                                                             %
2252 %                                                                             %
2253 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2254 */
2255 
2256   STRINGIFY(
2257   // Based on Box from resize.c
2258   float BoxResizeFilter(const float x)
2259   {
2260     return 1.0f;
2261   }
2262   )
2263 
2264   STRINGIFY(
2265   // Based on CubicBC from resize.c
2266   float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2267   {
2268     /*
2269     Cubic Filters using B,C determined values:
2270     Mitchell-Netravali  B = 1/3 C = 1/3  "Balanced" cubic spline filter
2271     Catmull-Rom         B = 0   C = 1/2  Interpolatory and exact on linears
2272     Spline              B = 1   C = 0    B-Spline Gaussian approximation
2273     Hermite             B = 0   C = 0    B-Spline interpolator
2274 
2275     See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2276     Graphics Computer Graphics, Volume 22, Number 4, August 1988
2277     http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2278     Mitchell.pdf.
2279 
2280     Coefficents are determined from B,C values:
2281     P0 = (  6 - 2*B       )/6 = coeff[0]
2282     P1 =         0
2283     P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2284     P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2285     Q0 = (      8*B +24*C )/6 = coeff[3]
2286     Q1 = (    -12*B -48*C )/6 = coeff[4]
2287     Q2 = (      6*B +30*C )/6 = coeff[5]
2288     Q3 = (    - 1*B - 6*C )/6 = coeff[6]
2289 
2290     which are used to define the filter:
2291 
2292     P0 + P1*x + P2*x^2 + P3*x^3      0 <= x < 1
2293     Q0 + Q1*x + Q2*x^2 + Q3*x^3      1 <= x < 2
2294 
2295     which ensures function is continuous in value and derivative (slope).
2296     */
2297     if (x < 1.0)
2298       return(resizeFilterCoefficients[0]+x*(x*
2299       (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2300     if (x < 2.0)
2301       return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2302       (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2303     return(0.0);
2304   }
2305   )
2306 
2307   STRINGIFY(
2308   float Sinc(const float x)
2309   {
2310     if (x != 0.0f)
2311     {
2312       const float alpha=(float) (MagickPI*x);
2313       return sinpi(x)/alpha;
2314     }
2315     return(1.0f);
2316   }
2317   )
2318 
2319   STRINGIFY(
2320   float Triangle(const float x)
2321   {
2322     /*
2323     1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2324     a Bartlett 2D Cone filter.  Also used as a Bartlett Windowing function
2325     for Sinc().
2326     */
2327     return ((x<1.0f)?(1.0f-x):0.0f);
2328   }
2329   )
2330 
2331 
2332   STRINGIFY(
2333   float Hann(const float x)
2334   {
2335     /*
2336     Cosine window function:
2337       0.5+0.5*cos(pi*x).
2338     */
2339     const float cosine=cos((MagickPI*x));
2340     return(0.5f+0.5f*cosine);
2341   }
2342   )
2343 
2344   STRINGIFY(
2345   float Hamming(const float x)
2346   {
2347     /*
2348       Offset cosine window function:
2349        .54 + .46 cos(pi x).
2350     */
2351     const float cosine=cos((MagickPI*x));
2352     return(0.54f+0.46f*cosine);
2353   }
2354   )
2355 
2356   STRINGIFY(
2357   float Blackman(const float x)
2358   {
2359     /*
2360       Blackman: 2nd order cosine windowing function:
2361         0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2362 
2363       Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2364       five flops.
2365     */
2366     const float cosine=cos((MagickPI*x));
2367     return(0.34f+cosine*(0.5f+cosine*0.16f));
2368   }
2369   )
2370 
2371   STRINGIFY(
2372   inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2373   {
2374     switch (filterType)
2375     {
2376     /* Call Sinc even for SincFast to get better precision on GPU
2377        and to avoid thread divergence.  Sinc is pretty fast on GPU anyway...*/
2378     case SincWeightingFunction:
2379     case SincFastWeightingFunction:
2380       return Sinc(x);
2381     case CubicBCWeightingFunction:
2382       return CubicBC(x,filterCoefficients);
2383     case BoxWeightingFunction:
2384       return BoxResizeFilter(x);
2385     case TriangleWeightingFunction:
2386       return Triangle(x);
2387     case HannWeightingFunction:
2388       return Hann(x);
2389     case HammingWeightingFunction:
2390       return Hamming(x);
2391     case BlackmanWeightingFunction:
2392       return Blackman(x);
2393 
2394     default:
2395       return 0.0f;
2396     }
2397   }
2398   )
2399 
2400 
2401   STRINGIFY(
2402   inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2403            , const ResizeWeightingFunctionType resizeWindowType
2404            , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2405   {
2406     float scale;
2407     float xBlur = fabs(x/resizeFilterBlur);
2408     if (resizeWindowSupport < MagickEpsilon
2409         || resizeWindowType == BoxWeightingFunction)
2410     {
2411       scale = 1.0f;
2412     }
2413     else
2414     {
2415       scale = resizeFilterScale;
2416       scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2417     }
2418     float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2419     return weight;
2420   }
2421 
2422   )
2423 
2424   ;
2425   const char *accelerateKernels2 =
2426 
2427   STRINGIFY(
2428 
2429   inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2430     return (numWorkItems/pixelPerWorkgroup);
2431   }
2432 
2433   // returns the index of the pixel for the current workitem to compute.
2434   // returns -1 if this workitem doesn't need to participate in any computation
2435   inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2436     const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2437     int pixelIndex = itemID/numWorkItemsPerPixel;
2438     pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2439     return pixelIndex;
2440   }
2441 
2442   )
2443 
2444   STRINGIFY(
2445   __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2446     void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2447       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2448       const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2449       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2450       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2451       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2452       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2453       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2454   {
2455     // calculate the range of resized image pixels computed by this workgroup
2456     const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2457     const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2458     const unsigned int actualNumPixelToCompute = stopX - startX;
2459 
2460     // calculate the range of input image pixels to cache
2461     float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2462     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2463     scale = PerceptibleReciprocal(scale);
2464 
2465     const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2466     const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2467 
2468     // cache the input pixels into local memory
2469     const unsigned int y = get_global_id(1);
2470     const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2471     const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2472     event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2473     wait_group_events(1, &e);
2474 
2475     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2476     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2477     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2478     {
2479       const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2480       const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2481       const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2482 
2483       // determine which resized pixel computed by this workitem
2484       const unsigned int itemID = get_local_id(0);
2485       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2486 
2487       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2488 
2489       float4 filteredPixel = (float4)0.0f;
2490       float density = 0.0f;
2491       float gamma = 0.0f;
2492       // -1 means this workitem doesn't participate in the computation
2493       if (pixelIndex != -1)
2494       {
2495         // x coordinated of the resized pixel computed by this workitem
2496         const int x = chunkStartX + pixelIndex;
2497 
2498         // calculate how many steps required for this pixel
2499         const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2500         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2501         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2502         const unsigned int n = stop - start;
2503 
2504         // calculate how many steps this workitem will contribute
2505         unsigned int numStepsPerWorkItem = n / numItems;
2506         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2507 
2508         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2509         if (startStep < n)
2510         {
2511           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2512 
2513           unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2514           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2515           {
2516             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2517               (ResizeWeightingFunctionType) resizeFilterType,
2518               (ResizeWeightingFunctionType) resizeWindowType,
2519               resizeFilterScale, resizeFilterWindowSupport,
2520               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2521 
2522             float4 cp = (float4)0.0f;
2523 
2524             __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2525             cp.x = (float) *(p);
2526             if (number_channels > 2)
2527             {
2528               cp.y = (float) *(p + 1);
2529               cp.z = (float) *(p + 2);
2530             }
2531 
2532             if (alpha_index != 0)
2533             {
2534               cp.w = (float) *(p + alpha_index);
2535 
2536               float alpha = weight * QuantumScale * cp.w;
2537 
2538               filteredPixel.x += alpha * cp.x;
2539               filteredPixel.y += alpha * cp.y;
2540               filteredPixel.z += alpha * cp.z;
2541               filteredPixel.w += weight * cp.w;
2542               gamma += alpha;
2543             }
2544             else
2545               filteredPixel += ((float4) weight)*cp;
2546 
2547             density += weight;
2548           }
2549         }
2550       }
2551 
2552       // initialize the accumulators to zero
2553       if (itemID < actualNumPixelInThisChunk) {
2554         outputPixelCache[itemID] = (float4)0.0f;
2555         densityCache[itemID] = 0.0f;
2556         if (alpha_index != 0)
2557           gammaCache[itemID] = 0.0f;
2558       }
2559       barrier(CLK_LOCAL_MEM_FENCE);
2560 
2561       // accumulatte the filtered pixel value and the density
2562       for (unsigned int i = 0; i < numItems; i++) {
2563         if (pixelIndex != -1) {
2564           if (itemID%numItems == i) {
2565             outputPixelCache[pixelIndex]+=filteredPixel;
2566             densityCache[pixelIndex]+=density;
2567             if (alpha_index != 0)
2568               gammaCache[pixelIndex]+=gamma;
2569           }
2570         }
2571         barrier(CLK_LOCAL_MEM_FENCE);
2572       }
2573 
2574       if (itemID < actualNumPixelInThisChunk)
2575       {
2576         float4 filteredPixel = outputPixelCache[itemID];
2577 
2578         float gamma = 0.0f;
2579         if (alpha_index != 0)
2580           gamma = gammaCache[itemID];
2581 
2582         float density = densityCache[itemID];
2583         if ((density != 0.0f) && (density != 1.0f))
2584         {
2585           density = PerceptibleReciprocal(density);
2586           filteredPixel *= (float4) density;
2587           if (alpha_index != 0)
2588             gamma *= density;
2589         }
2590 
2591         if (alpha_index != 0)
2592         {
2593           gamma = PerceptibleReciprocal(gamma);
2594           filteredPixel.x *= gamma;
2595           filteredPixel.y *= gamma;
2596           filteredPixel.z *= gamma;
2597         }
2598 
2599         WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2600       }
2601     }
2602   }
2603   )
2604 
2605 
2606   STRINGIFY(
2607  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2608     void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2609       const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2610       const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2611       const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2612       const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2613       const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2614       const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2615       __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2616   {
2617     // calculate the range of resized image pixels computed by this workgroup
2618     const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2619     const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2620     const unsigned int actualNumPixelToCompute = stopY - startY;
2621 
2622     // calculate the range of input image pixels to cache
2623     float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2624     const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2625     scale = PerceptibleReciprocal(scale);
2626 
2627     const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2628     const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2629 
2630     // cache the input pixels into local memory
2631     const unsigned int x = get_global_id(0);
2632     unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2633     unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2634     unsigned int stride = inputColumns * number_channels;
2635     for (unsigned int i = 0; i < number_channels; i++)
2636     {
2637       event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2638       wait_group_events(1,&e);
2639     }
2640 
2641     unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2642     unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2643     for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2644     {
2645       const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2646       const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2647       const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2648 
2649       // determine which resized pixel computed by this workitem
2650       const unsigned int itemID = get_local_id(1);
2651       const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2652 
2653       const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2654 
2655       float4 filteredPixel = (float4)0.0f;
2656       float density = 0.0f;
2657       float gamma = 0.0f;
2658       // -1 means this workitem doesn't participate in the computation
2659       if (pixelIndex != -1)
2660       {
2661         // x coordinated of the resized pixel computed by this workitem
2662         const int y = chunkStartY + pixelIndex;
2663 
2664         // calculate how many steps required for this pixel
2665         const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2666         const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2667         const unsigned int stop  = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2668         const unsigned int n = stop - start;
2669 
2670         // calculate how many steps this workitem will contribute
2671         unsigned int numStepsPerWorkItem = n / numItems;
2672         numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2673 
2674         const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2675         if (startStep < n)
2676         {
2677           const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2678 
2679           unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2680           for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2681           {
2682             float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2683               (ResizeWeightingFunctionType) resizeFilterType,
2684               (ResizeWeightingFunctionType) resizeWindowType,
2685               resizeFilterScale, resizeFilterWindowSupport,
2686               resizeFilterBlur, scale*(start + i - bisect + 0.5));
2687 
2688             float4 cp = (float4)0.0f;
2689 
2690             __local CLQuantum *p = inputImageCache + cacheIndex;
2691             cp.x = (float) *(p);
2692             if (number_channels > 2)
2693             {
2694               cp.y = (float) *(p + rangeLength);
2695               cp.z = (float) *(p + (rangeLength * 2));
2696             }
2697 
2698             if (alpha_index != 0)
2699             {
2700               cp.w = (float) *(p + (rangeLength * alpha_index));
2701 
2702               float alpha = weight * QuantumScale * cp.w;
2703 
2704               filteredPixel.x += alpha * cp.x;
2705               filteredPixel.y += alpha * cp.y;
2706               filteredPixel.z += alpha * cp.z;
2707               filteredPixel.w += weight * cp.w;
2708               gamma += alpha;
2709             }
2710             else
2711               filteredPixel += ((float4) weight)*cp;
2712 
2713             density += weight;
2714           }
2715         }
2716       }
2717 
2718       // initialize the accumulators to zero
2719       if (itemID < actualNumPixelInThisChunk) {
2720         outputPixelCache[itemID] = (float4)0.0f;
2721         densityCache[itemID] = 0.0f;
2722         if (alpha_index != 0)
2723           gammaCache[itemID] = 0.0f;
2724       }
2725       barrier(CLK_LOCAL_MEM_FENCE);
2726 
2727       // accumulatte the filtered pixel value and the density
2728       for (unsigned int i = 0; i < numItems; i++) {
2729         if (pixelIndex != -1) {
2730           if (itemID%numItems == i) {
2731             outputPixelCache[pixelIndex]+=filteredPixel;
2732             densityCache[pixelIndex]+=density;
2733             if (alpha_index != 0)
2734               gammaCache[pixelIndex]+=gamma;
2735           }
2736         }
2737         barrier(CLK_LOCAL_MEM_FENCE);
2738       }
2739 
2740       if (itemID < actualNumPixelInThisChunk)
2741       {
2742         float4 filteredPixel = outputPixelCache[itemID];
2743 
2744         float gamma = 0.0f;
2745         if (alpha_index != 0)
2746           gamma = gammaCache[itemID];
2747 
2748         float density = densityCache[itemID];
2749         if ((density != 0.0f) && (density != 1.0f))
2750         {
2751           density = PerceptibleReciprocal(density);
2752           filteredPixel *= (float4) density;
2753           if (alpha_index != 0)
2754             gamma *= density;
2755         }
2756 
2757         if (alpha_index != 0)
2758         {
2759           gamma = PerceptibleReciprocal(gamma);
2760           filteredPixel.x *= gamma;
2761           filteredPixel.y *= gamma;
2762           filteredPixel.z *= gamma;
2763         }
2764 
2765         WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2766       }
2767     }
2768   }
2769   )
2770 
2771 /*
2772 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2773 %                                                                             %
2774 %                                                                             %
2775 %                                                                             %
2776 %     R o t a t i o n a l B l u r                                             %
2777 %                                                                             %
2778 %                                                                             %
2779 %                                                                             %
2780 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2781 */
2782 
2783   STRINGIFY(
2784   __kernel void RotationalBlur(const __global CLQuantum *image,
2785     const unsigned int number_channels,const unsigned int channel,
2786     const float2 blurCenter,__constant float *cos_theta,
2787     __constant float *sin_theta,const unsigned int cossin_theta_size,
2788     __global CLQuantum *filteredImage)
2789   {
2790     const int x = get_global_id(0);
2791     const int y = get_global_id(1);
2792     const int columns = get_global_size(0);
2793     const int rows = get_global_size(1);
2794     unsigned int step = 1;
2795     float center_x = (float) x - blurCenter.x;
2796     float center_y = (float) y - blurCenter.y;
2797     float radius = hypot(center_x, center_y);
2798 
2799     float blur_radius = hypot(blurCenter.x, blurCenter.y);
2800 
2801     if (radius > MagickEpsilon)
2802     {
2803       step = (unsigned int) (blur_radius / radius);
2804       if (step == 0)
2805         step = 1;
2806       if (step >= cossin_theta_size)
2807         step = cossin_theta_size-1;
2808     }
2809 
2810     float4 result = 0.0f;
2811     float normalize = 0.0f;
2812     float gamma = 0.0f;
2813 
2814     for (unsigned int i=0; i<cossin_theta_size; i+=step)
2815     {
2816       int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2817       int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2818 
2819       float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2820 
2821       if ((number_channels == 4) || (number_channels == 2))
2822       {
2823         float alpha = (float)(QuantumScale*pixel.w);
2824 
2825         gamma += alpha;
2826 
2827         result.x += alpha * pixel.x;
2828         result.y += alpha * pixel.y;
2829         result.z += alpha * pixel.z;
2830         result.w += pixel.w;
2831       }
2832       else
2833         result += pixel;
2834 
2835       normalize += 1.0f;
2836     }
2837 
2838     normalize = PerceptibleReciprocal(normalize);
2839 
2840     if ((number_channels == 4) || (number_channels == 2))
2841     {
2842       gamma = PerceptibleReciprocal(gamma);
2843       result.x *= gamma;
2844       result.y *= gamma;
2845       result.z *= gamma;
2846       result.w *= normalize;
2847     }
2848     else
2849       result *= normalize;
2850 
2851     WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2852   }
2853   )
2854 
2855 /*
2856 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2857 %                                                                             %
2858 %                                                                             %
2859 %                                                                             %
2860 %     U n s h a r p M a s k                                                   %
2861 %                                                                             %
2862 %                                                                             %
2863 %                                                                             %
2864 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2865 */
2866 
2867   STRINGIFY(
2868   __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2869     const __global float4 *blurRowData,const unsigned int number_channels,
2870     const ChannelType channel,const unsigned int columns,
2871     const unsigned int rows,__local float4* cachedData,
2872     __local float* cachedFilter,const __global float *filter,
2873     const unsigned int width,const float gain, const float threshold,
2874     __global CLQuantum *filteredImage)
2875   {
2876     const unsigned int radius = (width-1)/2;
2877 
2878     // cache the pixel shared by the workgroup
2879     const int groupX = get_group_id(0);
2880     const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2881     const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2882 
2883     if ((groupStartY >= 0) && (groupStopY < rows))
2884     {
2885       event_t e = async_work_group_strided_copy(cachedData,
2886         blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2887       wait_group_events(1,&e);
2888     }
2889     else
2890     {
2891       for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2892         cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2893 
2894       barrier(CLK_LOCAL_MEM_FENCE);
2895     }
2896     // cache the filter as well
2897     event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2898     wait_group_events(1,&e);
2899 
2900     // only do the work if this is not a patched item
2901     const int cy = get_global_id(1);
2902 
2903     if (cy < rows)
2904     {
2905       float4 blurredPixel = (float4) 0.0f;
2906 
2907       int i = 0;
2908 
2909       for ( ; i+7 < width; )
2910       {
2911         for (int j=0; j < 8; j++, i++)
2912           blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2913       }
2914 
2915       for ( ; i < width; i++)
2916         blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2917 
2918       float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2919       float4 outputPixel = inputImagePixel - blurredPixel;
2920 
2921       float quantumThreshold = QuantumRange*threshold;
2922 
2923       int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2924       outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2925 
2926       //write back
2927       WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2928     }
2929   }
2930   )
2931 
2932   STRINGIFY(
2933   __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2934     const ChannelType channel,__constant float *filter,const unsigned int width,
2935     const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2936     const float gain,const float threshold,__global CLQuantum *filteredImage)
2937   {
2938     const unsigned int x = get_global_id(0);
2939     const unsigned int y = get_global_id(1);
2940 
2941     const unsigned int radius = (width - 1) / 2;
2942 
2943     int row = y - radius;
2944     int baseRow = get_group_id(1) * get_local_size(1) - radius;
2945     int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2946 
2947     while (row < endRow) {
2948       int srcy = (row < 0) ? -row : row; // mirror pad
2949       srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2950 
2951       float4 value = 0.0f;
2952 
2953       int ix = x - radius;
2954       int i = 0;
2955 
2956       while (i + 7 < width) {
2957         for (int j = 0; j < 8; ++j) { // unrolled
2958           int srcx = ix + j;
2959           srcx = (srcx < 0) ? -srcx : srcx;
2960           srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2961           value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2962         }
2963         ix += 8;
2964         i += 8;
2965       }
2966 
2967       while (i < width) {
2968         int srcx = (ix < 0) ? -ix : ix; // mirror pad
2969         srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2970         value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2971         ++i;
2972         ++ix;
2973       }
2974       pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2975       row += get_local_size(1);
2976     }
2977 
2978     barrier(CLK_LOCAL_MEM_FENCE);
2979 
2980     const int px = get_local_id(0);
2981     const int py = get_local_id(1);
2982     const int prp = get_local_size(0);
2983     float4 value = (float4)(0.0f);
2984 
2985     int i = 0;
2986     while (i + 7 < width) {
2987       for (int j = 0; j < 8; ++j) // unrolled
2988         value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2989       i += 8;
2990     }
2991     while (i < width) {
2992       value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2993       ++i;
2994     }
2995 
2996     float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
2997     float4 diff = srcPixel - value;
2998 
2999     float quantumThreshold = QuantumRange*threshold;
3000 
3001     int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3002     value = select(srcPixel + diff * gain, srcPixel, mask);
3003 
3004     if ((x < columns) && (y < rows))
3005       WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3006   }
3007   )
3008 
3009   STRINGIFY(
3010     __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3011     void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3012       const unsigned int number_channels,const unsigned int max_channels,
3013       const float threshold,const int passes,const unsigned int imageWidth,
3014       const unsigned int imageHeight)
3015   {
3016     const int pad = (1 << (passes - 1));
3017     const int tileSize = 64;
3018     const int tileRowPixels = 64;
3019     const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3020 
3021     CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3022 
3023     local float buffer[64 * 64];
3024 
3025     int srcx = get_group_id(0) * (tileSize - 2 * pad) - pad + get_local_id(0);
3026     int srcy = get_group_id(1) * (tileSize - 2 * pad) - pad;
3027 
3028     for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3029       int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3030                 (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3031 
3032       for (int channel = 0; channel < max_channels; ++channel)
3033         stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3034     }
3035 
3036     for (int channel = 0; channel < max_channels; ++channel) {
3037       // Load LDS
3038       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3039         buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3040 
3041       // Process
3042 
3043       float tmp[16];
3044       float accum[16];
3045       float pixel;
3046 
3047       for (int i = 0; i < 16; i++)
3048         accum[i]=0.0f;
3049 
3050       for (int pass = 0; pass < passes; ++pass) {
3051         const int radius = 1 << pass;
3052         const int x = get_local_id(0);
3053         const float thresh = threshold * noise[pass];
3054 
3055         // Apply horizontal hat
3056         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3057           const int offset = i * tileRowPixels;
3058           if (pass == 0)
3059             tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3060           pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3061           barrier(CLK_LOCAL_MEM_FENCE);
3062           buffer[x + offset] = pixel;
3063         }
3064         barrier(CLK_LOCAL_MEM_FENCE);
3065 
3066         // Apply vertical hat
3067         for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3068           pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3069           float delta = tmp[i / 4] - pixel;
3070           tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3071           if (delta < -thresh)
3072             delta += thresh;
3073           else if (delta > thresh)
3074             delta -= thresh;
3075           else
3076             delta = 0;
3077           accum[i / 4] += delta;
3078         }
3079         barrier(CLK_LOCAL_MEM_FENCE);
3080 
3081         if (pass < passes - 1)
3082           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3083             buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3084         else  // last pass
3085           for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3086             accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3087         barrier(CLK_LOCAL_MEM_FENCE);
3088       }
3089 
3090       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3091         stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3092 
3093       barrier(CLK_LOCAL_MEM_FENCE);
3094     }
3095 
3096     // Write from stage to output
3097 
3098     if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3099       for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3100         if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3101           int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3102           for (int channel = 0; channel < max_channels; ++channel) {
3103             dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3104           }
3105         }
3106       }
3107     }
3108   }
3109   )
3110 
3111   ;
3112 
3113 #endif // MAGICKCORE_OPENCL_SUPPORT
3114 
3115 #if defined(__cplusplus) || defined(c_plusplus)
3116 }
3117 #endif
3118 
3119 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
3120