1///////////////////////////// OpenCL kernels for face detection //////////////////////////////
2////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
3
4//
5// the code has been derived from the OpenCL Haar cascade kernel by
6//
7//    Niko Li, newlife20080214@gmail.com
8//    Wang Weiyan, wangweiyanster@gmail.com
9//    Jia Haipeng, jiahaipeng95@gmail.com
10//    Nathan, liujun@multicorewareinc.com
11//    Peng Xiao, pengxiao@outlook.com
12//    Erping Pang, erping@multicorewareinc.com
13//
14
15#ifdef HAAR
16typedef struct __attribute__((aligned(4))) OptHaarFeature
17{
18    int4 ofs[3] __attribute__((aligned (4)));
19    float4 weight __attribute__((aligned (4)));
20}
21OptHaarFeature;
22#endif
23
24#ifdef LBP
25typedef struct __attribute__((aligned(4))) OptLBPFeature
26{
27    int16 ofs __attribute__((aligned (4)));
28}
29OptLBPFeature;
30#endif
31
32typedef struct __attribute__((aligned(4))) Stump
33{
34    float4 st __attribute__((aligned (4)));
35}
36Stump;
37
38typedef struct __attribute__((aligned(4))) Node
39{
40    int4 n __attribute__((aligned (4)));
41}
42Node;
43
44typedef struct __attribute__((aligned (4))) Stage
45{
46    int first __attribute__((aligned (4)));
47    int ntrees __attribute__((aligned (4)));
48    float threshold __attribute__((aligned (4)));
49}
50Stage;
51
52typedef struct __attribute__((aligned (4))) ScaleData
53{
54    float scale __attribute__((aligned (4)));
55    int szi_width __attribute__((aligned (4)));
56    int szi_height __attribute__((aligned (4)));
57    int layer_ofs __attribute__((aligned (4)));
58    int ystep __attribute__((aligned (4)));
59}
60ScaleData;
61
62#ifndef SUM_BUF_SIZE
63#define SUM_BUF_SIZE 0
64#endif
65
66#ifndef NODE_COUNT
67#define NODE_COUNT 1
68#endif
69
70#ifdef HAAR
71__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
72void runHaarClassifier(
73    int nscales, __global const ScaleData* scaleData,
74    __global const int* sum,
75    int _sumstep, int sumoffset,
76    __global const OptHaarFeature* optfeatures,
77    __global const Stage* stages,
78    __global const Node* nodes,
79    __global const float* leaves0,
80
81    volatile __global int* facepos,
82    int4 normrect, int sqofs, int2 windowsize)
83{
84    int lx = get_local_id(0);
85    int ly = get_local_id(1);
86    int groupIdx = get_group_id(0);
87    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
88    int scaleIdx, tileIdx, stageIdx;
89    int sumstep = (int)(_sumstep/sizeof(int));
90    int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
91                        mad24(normrect.y, sumstep, normrect.x + normrect.z),
92                        mad24(normrect.y + normrect.w, sumstep, normrect.x),
93                        mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
94    int normarea = normrect.z * normrect.w;
95    float invarea = 1.f/normarea;
96    int lidx = ly*LOCAL_SIZE_X + lx;
97
98    #if SUM_BUF_SIZE > 0
99    int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
100                       mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
101                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
102                       mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
103    #else
104    int4 nofs = nofs0;
105    #endif
106    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
107    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
108    #if SUM_BUF_SIZE > 0
109    __local int* ibuf = lstore;
110    __local int* lcount = ibuf + SUM_BUF_SIZE;
111    #else
112    __local int* lcount = lstore;
113    #endif
114    __local float* lnf = (__local float*)(lcount + 1);
115    __local float* lpartsum = lnf + LOCAL_SIZE;
116    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
117
118    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
119    {
120        __global const ScaleData* s = scaleData + scaleIdx;
121        int ystep = s->ystep;
122        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
123        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
124                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
125        int totalTiles = ntiles.x*ntiles.y;
126
127        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
128        {
129            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
130            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
131            int ix = lx, iy = ly;
132            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
133            __global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
134
135            if( ix0 >= worksize.x || iy0 >= worksize.y )
136                continue;
137            #if SUM_BUF_SIZE > 0
138            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
139            {
140                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
141                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
142            }
143            #endif
144
145            if( lidx == 0 )
146                lcount[0] = 0;
147            barrier(CLK_LOCAL_MEM_FENCE);
148
149            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
150            {
151                #if NODE_COUNT==1
152                __global const Stump* stump = (__global const Stump*)nodes;
153                #else
154                __global const Node* node = nodes;
155                __global const float* leaves = leaves0;
156                #endif
157                #if SUM_BUF_SIZE > 0
158                __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
159                #else
160                __global const int* psum = psum1;
161                #endif
162
163                __global const int* psqsum = (__global const int*)(psum1 + sqofs);
164                float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
165                float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
166                float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
167                nf = nf > 0 ? nf : 1.f;
168
169                for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
170                {
171                    int ntrees = stages[stageIdx].ntrees;
172                    float s = 0.f;
173                    #if NODE_COUNT==1
174                    for( i = 0; i < ntrees; i++ )
175                    {
176                        float4 st = stump[i].st;
177                        __global const OptHaarFeature* f = optfeatures + as_int(st.x);
178                        float4 weight = f->weight;
179
180                        int4 ofs = f->ofs[0];
181                        sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
182                        ofs = f->ofs[1];
183                        sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
184                        if( weight.z > 0 )
185                        {
186                            ofs = f->ofs[2];
187                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
188                        }
189
190                        s += (sval < st.y*nf) ? st.z : st.w;
191                    }
192                    stump += ntrees;
193                    #else
194                    for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
195                    {
196                        int idx = 0;
197                        do
198                        {
199                            int4 n = node[idx].n;
200                            __global const OptHaarFeature* f = optfeatures + n.x;
201                            float4 weight = f->weight;
202
203                            int4 ofs = f->ofs[0];
204
205                            sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
206                            ofs = f->ofs[1];
207                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
208                            if( weight.z > 0 )
209                            {
210                                ofs = f->ofs[2];
211                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
212                            }
213
214                            idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
215                        }
216                        while(idx > 0);
217                        s += leaves[-idx];
218                    }
219                    #endif
220
221                    if( s < stages[stageIdx].threshold )
222                        break;
223                }
224
225                if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
226                {
227                    int count = atomic_inc(lcount);
228                    lbuf[count] = (int)(ix | (iy << 8));
229                    lnf[count] = nf;
230                }
231            }
232
233            for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
234            {
235                barrier(CLK_LOCAL_MEM_FENCE);
236                int nrects = lcount[0];
237
238                if( nrects == 0 )
239                    break;
240                barrier(CLK_LOCAL_MEM_FENCE);
241                if( lidx == 0 )
242                    lcount[0] = 0;
243
244                {
245                    #if NODE_COUNT == 1
246                    __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
247                    #else
248                    __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
249                    __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
250                    #endif
251                    int nparts = LOCAL_SIZE / nrects;
252                    int ntrees = stages[stageIdx].ntrees;
253                    int ntrees_p = (ntrees + nparts - 1)/nparts;
254                    int nr = lidx / nparts;
255                    int partidx = -1, idxval = 0;
256                    float partsum = 0.f, nf = 0.f;
257
258                    if( nr < nrects )
259                    {
260                        partidx = lidx % nparts;
261                        idxval = lbuf[nr];
262                        nf = lnf[nr];
263
264                        {
265                        int ntrees0 = ntrees_p*partidx;
266                        int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
267                        int ix1 = idxval & 255, iy1 = idxval >> 8;
268                        #if SUM_BUF_SIZE > 0
269                        __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
270                        #else
271                        __global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
272                        #endif
273
274                        #if NODE_COUNT == 1
275                        for( i = ntrees0; i < ntrees1; i++ )
276                        {
277                            float4 st = stump[i].st;
278                            __global const OptHaarFeature* f = optfeatures + as_int(st.x);
279                            float4 weight = f->weight;
280
281                            int4 ofs = f->ofs[0];
282                            float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
283                            ofs = f->ofs[1];
284                            sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
285                            //if( weight.z > 0 )
286                            if( fabs(weight.z) > 0 )
287                            {
288                                ofs = f->ofs[2];
289                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
290                            }
291
292                            partsum += (sval < st.y*nf) ? st.z : st.w;
293                        }
294                        #else
295                        for( i = ntrees0; i < ntrees1; i++ )
296                        {
297                            int idx = 0;
298                            do
299                            {
300                                int4 n = node[i*2 + idx].n;
301                                __global const OptHaarFeature* f = optfeatures + n.x;
302                                float4 weight = f->weight;
303                                int4 ofs = f->ofs[0];
304
305                                float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
306                                ofs = f->ofs[1];
307                                sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
308                                if( weight.z > 0 )
309                                {
310                                    ofs = f->ofs[2];
311                                    sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
312                                }
313
314                                idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
315                            }
316                            while(idx > 0);
317                            partsum += leaves[i*3-idx];
318                        }
319                        #endif
320                        }
321                    }
322                    lpartsum[lidx] = partsum;
323                    barrier(CLK_LOCAL_MEM_FENCE);
324
325                    if( partidx == 0 )
326                    {
327                        float s = lpartsum[nr*nparts];
328                        for( i = 1; i < nparts; i++ )
329                            s += lpartsum[i + nr*nparts];
330                        if( s >= stages[stageIdx].threshold )
331                        {
332                            int count = atomic_inc(lcount);
333                            lbuf[count] = idxval;
334                            lnf[count] = nf;
335                        }
336                    }
337                }
338            }
339
340            barrier(CLK_LOCAL_MEM_FENCE);
341            if( stageIdx == N_STAGES )
342            {
343                int nrects = lcount[0];
344                if( lidx < nrects )
345                {
346                    int nfaces = atomic_inc(facepos);
347                    if( nfaces < MAX_FACES )
348                    {
349                        volatile __global int* face = facepos + 1 + nfaces*3;
350                        int val = lbuf[lidx];
351                        face[0] = scaleIdx;
352                        face[1] = ix0 + (val & 255);
353                        face[2] = iy0 + (val >> 8);
354                    }
355                }
356            }
357        }
358    }
359}
360#endif
361
362#ifdef LBP
363#undef CALC_SUM_OFS_
364#define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
365    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
366
367__kernel void runLBPClassifierStumpSimple(
368    int nscales, __global const ScaleData* scaleData,
369    __global const int* sum,
370    int _sumstep, int sumoffset,
371    __global const OptLBPFeature* optfeatures,
372    __global const Stage* stages,
373    __global const Stump* stumps,
374    __global const int* bitsets,
375    int bitsetSize,
376
377    volatile __global int* facepos,
378    int2 windowsize)
379{
380    int lx = get_local_id(0);
381    int ly = get_local_id(1);
382    int local_size_x = get_local_size(0);
383    int local_size_y = get_local_size(1);
384    int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
385    int ngroups = get_num_groups(0)*get_num_groups(1);
386    int scaleIdx, tileIdx, stageIdx;
387    int sumstep = (int)(_sumstep/sizeof(int));
388
389    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
390    {
391        __global const ScaleData* s = scaleData + scaleIdx;
392        int ystep = s->ystep;
393        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
394        int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
395                             (worksize.y/ystep + local_size_y-1)/local_size_y);
396        int totalTiles = ntiles.x*ntiles.y;
397
398        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
399        {
400            int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep;
401            int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep;
402
403            if( ix < worksize.x && iy < worksize.y )
404            {
405                __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
406                __global const Stump* stump = stumps;
407                __global const int* bitset = bitsets;
408
409                for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
410                {
411                    int i, ntrees = stages[stageIdx].ntrees;
412                    float s = 0.f;
413                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
414                    {
415                        float4 st = stump->st;
416                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
417                        int16 ofs = f->ofs;
418
419                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
420
421                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
422                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
423                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
424
425                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
426                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
427                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
428                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
429                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
430
431                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
432                    }
433
434                    if( s < stages[stageIdx].threshold )
435                        break;
436                }
437
438                if( stageIdx == N_STAGES )
439                {
440                    int nfaces = atomic_inc(facepos);
441                    if( nfaces < MAX_FACES )
442                    {
443                        volatile __global int* face = facepos + 1 + nfaces*3;
444                        face[0] = scaleIdx;
445                        face[1] = ix;
446                        face[2] = iy;
447                    }
448                }
449            }
450        }
451    }
452}
453
454__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
455void runLBPClassifierStump(
456    int nscales, __global const ScaleData* scaleData,
457    __global const int* sum,
458    int _sumstep, int sumoffset,
459    __global const OptLBPFeature* optfeatures,
460    __global const Stage* stages,
461    __global const Stump* stumps,
462    __global const int* bitsets,
463    int bitsetSize,
464
465    volatile __global int* facepos,
466    int2 windowsize)
467{
468    int lx = get_local_id(0);
469    int ly = get_local_id(1);
470    int groupIdx = get_group_id(0);
471    int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
472    int scaleIdx, tileIdx, stageIdx;
473    int sumstep = (int)(_sumstep/sizeof(int));
474    int lidx = ly*LOCAL_SIZE_X + lx;
475
476    #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
477    __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
478    #if SUM_BUF_SIZE > 0
479    __local int* ibuf = lstore;
480    __local int* lcount = ibuf + SUM_BUF_SIZE;
481    #else
482    __local int* lcount = lstore;
483    #endif
484    __local float* lpartsum = (__local float*)(lcount + 1);
485    __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
486
487    for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
488    {
489        __global const ScaleData* s = scaleData + scaleIdx;
490        int ystep = s->ystep;
491        int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
492        int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
493                             (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
494        int totalTiles = ntiles.x*ntiles.y;
495
496        for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
497        {
498            int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
499            int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
500            int ix = lx, iy = ly;
501            __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
502
503            if( ix0 >= worksize.x || iy0 >= worksize.y )
504                continue;
505            #if SUM_BUF_SIZE > 0
506            for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
507            {
508                int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
509                vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
510            }
511            barrier(CLK_LOCAL_MEM_FENCE);
512            #endif
513
514            if( lidx == 0 )
515                lcount[0] = 0;
516            barrier(CLK_LOCAL_MEM_FENCE);
517
518            if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
519            {
520                __global const Stump* stump = stumps;
521                __global const int* bitset = bitsets;
522                #if SUM_BUF_SIZE > 0
523                __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
524                #else
525                __global const int* p = psum0 + mad24(iy, sumstep, ix);
526                #endif
527
528                for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
529                {
530                    int ntrees = stages[stageIdx].ntrees;
531                    float s = 0.f;
532                    for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
533                    {
534                        float4 st = stump->st;
535                        __global const OptLBPFeature* f = optfeatures + as_int(st.x);
536                        int16 ofs = f->ofs;
537
538                        int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
539
540                        int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
541                        idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
542                        idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
543
544                        mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
545                        mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
546                        mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
547                        mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
548                        mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
549
550                        s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
551                    }
552
553                    if( s < stages[stageIdx].threshold )
554                        break;
555                }
556
557                if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
558                {
559                    int count = atomic_inc(lcount);
560                    lbuf[count] = (int)(ix | (iy << 8));
561                }
562            }
563
564            for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
565            {
566                int nrects = lcount[0];
567
568                barrier(CLK_LOCAL_MEM_FENCE);
569                if( nrects == 0 )
570                    break;
571                if( lidx == 0 )
572                    lcount[0] = 0;
573
574                {
575                    __global const Stump* stump = stumps + stages[stageIdx].first;
576                    __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
577                    int nparts = LOCAL_SIZE / nrects;
578                    int ntrees = stages[stageIdx].ntrees;
579                    int ntrees_p = (ntrees + nparts - 1)/nparts;
580                    int nr = lidx / nparts;
581                    int partidx = -1, idxval = 0;
582                    float partsum = 0.f, nf = 0.f;
583
584                    if( nr < nrects )
585                    {
586                        partidx = lidx % nparts;
587                        idxval = lbuf[nr];
588
589                        {
590                            int ntrees0 = ntrees_p*partidx;
591                            int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
592                            int ix1 = idxval & 255, iy1 = idxval >> 8;
593                            #if SUM_BUF_SIZE > 0
594                            __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
595                            #else
596                            __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
597                            #endif
598
599                            for( i = ntrees0; i < ntrees1; i++ )
600                            {
601                                float4 st = stump[i].st;
602                                __global const OptLBPFeature* f = optfeatures + as_int(st.x);
603                                int16 ofs = f->ofs;
604
605                                #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
606                                    ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
607
608                                int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
609
610                                int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
611                                idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
612                                idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
613
614                                mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
615                                mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
616                                mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
617                                mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
618                                mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
619
620                                partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
621                            }
622                        }
623                    }
624                    lpartsum[lidx] = partsum;
625                    barrier(CLK_LOCAL_MEM_FENCE);
626
627                    if( partidx == 0 )
628                    {
629                        float s = lpartsum[nr*nparts];
630                        for( i = 1; i < nparts; i++ )
631                            s += lpartsum[i + nr*nparts];
632                        if( s >= stages[stageIdx].threshold )
633                        {
634                            int count = atomic_inc(lcount);
635                            lbuf[count] = idxval;
636                        }
637                    }
638                }
639            }
640
641            barrier(CLK_LOCAL_MEM_FENCE);
642            if( stageIdx == N_STAGES )
643            {
644                int nrects = lcount[0];
645                if( lidx < nrects )
646                {
647                    int nfaces = atomic_inc(facepos);
648                    if( nfaces < MAX_FACES )
649                    {
650                        volatile __global int* face = facepos + 1 + nfaces*3;
651                        int val = lbuf[lidx];
652                        face[0] = scaleIdx;
653                        face[1] = ix0 + (val & 255);
654                        face[2] = iy0 + (val >> 8);
655                    }
656                }
657            }
658        }
659    }
660}
661#endif
662