1/* See LICENSE file in the root OpenCV directory */ 2 3#if TILE_SIZE != 32 4#error "TILE SIZE should be 32" 5#endif 6 7 8__kernel void moments(__global const uchar* src, int src_step, int src_offset, 9 int src_rows, int src_cols, __global int* mom0, int xtiles) 10{ 11 int x0 = get_global_id(0); 12 int y0 = get_group_id(1); 13 int x, y = get_local_id(1); 14 int x_min = x0*TILE_SIZE; 15 int ypix = y0*TILE_SIZE + y; 16 __local int mom[TILE_SIZE][10]; 17 18 if (x_min < src_cols && y0*TILE_SIZE < src_rows) 19 { 20 if (ypix < src_rows) 21 { 22 int x_max = min(src_cols - x_min, TILE_SIZE); 23 __global const uchar* ptr = src + src_offset + ypix*src_step + x_min; 24 int4 S = (int4)(0, 0, 0, 0), p; 25 26#define SUM_ELEM(elem, ofs) \ 27 (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem 28 29 x = x_max & -4; 30 if (x_max >= 4) 31 { 32 p = convert_int4(vload4(0, ptr)); 33#ifdef OP_MOMENTS_BINARY 34 p = min(p, 1); 35#endif 36 S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) + 37 (int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27); 38 //SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3); 39 40 if (x_max >= 8) 41 { 42 p = convert_int4(vload4(0, ptr + 4)); 43#ifdef OP_MOMENTS_BINARY 44 p = min(p, 1); 45#endif 46 S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) + 47 (int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343); 48 //SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7); 49 50 if (x_max >= 12) 51 { 52 p = convert_int4(vload4(0, ptr + 8)); 53#ifdef OP_MOMENTS_BINARY 54 p = min(p, 1); 55#endif 56 S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) + 57 (int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331); 58 //SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11); 59 60 if (x_max >= 16) 61 { 62 p = convert_int4(vload4(0, ptr + 12)); 63#ifdef OP_MOMENTS_BINARY 64 p = min(p, 1); 65#endif 66 S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) + 67 (int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375); 68 //SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15); 69 } 70 } 71 } 72 } 73 74 if (x_max >= 20) 75 { 76 p = convert_int4(vload4(0, ptr + 16)); 77#ifdef OP_MOMENTS_BINARY 78 p = min(p, 1); 79#endif 80 S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) + 81 (int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859); 82 //SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19); 83 84 if (x_max >= 24) 85 { 86 p = convert_int4(vload4(0, ptr + 20)); 87#ifdef OP_MOMENTS_BINARY 88 p = min(p, 1); 89#endif 90 S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) + 91 (int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167); 92 //SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23); 93 94 if (x_max >= 28) 95 { 96 p = convert_int4(vload4(0, ptr + 24)); 97#ifdef OP_MOMENTS_BINARY 98 p = min(p, 1); 99#endif 100 S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) + 101 (int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683); 102 //SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27); 103 104 if (x_max >= 32) 105 { 106 p = convert_int4(vload4(0, ptr + 28)); 107#ifdef OP_MOMENTS_BINARY 108 p = min(p, 1); 109#endif 110 S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) + 111 (int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791); 112 //SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31); 113 } 114 } 115 } 116 } 117 118 if (x < x_max) 119 { 120 int ps = ptr[x]; 121#ifdef OP_MOMENTS_BINARY 122 ps = min(ps, 1); 123#endif 124 S += SUM_ELEM(ps, x); 125 if (x + 1 < x_max) 126 { 127 ps = ptr[x + 1]; 128#ifdef OP_MOMENTS_BINARY 129 ps = min(ps, 1); 130#endif 131 S += SUM_ELEM(ps, x + 1); 132 if (x + 2 < x_max) 133 { 134 ps = ptr[x + 2]; 135#ifdef OP_MOMENTS_BINARY 136 ps = min(ps, 1); 137#endif 138 S += SUM_ELEM(ps, x + 2); 139 } 140 } 141 } 142 143 int sy = y*y; 144 145 mom[y][0] = S.s0; 146 mom[y][1] = S.s1; 147 mom[y][2] = y*S.s0; 148 mom[y][3] = S.s2; 149 mom[y][4] = y*S.s1; 150 mom[y][5] = sy*S.s0; 151 mom[y][6] = S.s3; 152 mom[y][7] = y*S.s2; 153 mom[y][8] = sy*S.s1; 154 mom[y][9] = y*sy*S.s0; 155 } 156 else 157 mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] = 158 mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0; 159 barrier(CLK_LOCAL_MEM_FENCE); 160 161#define REDUCE(d) \ 162 if (y < d) \ 163 { \ 164 mom[y][0] += mom[y + d][0]; \ 165 mom[y][1] += mom[y + d][1]; \ 166 mom[y][2] += mom[y + d][2]; \ 167 mom[y][3] += mom[y + d][3]; \ 168 mom[y][4] += mom[y + d][4]; \ 169 mom[y][5] += mom[y + d][5]; \ 170 mom[y][6] += mom[y + d][6]; \ 171 mom[y][7] += mom[y + d][7]; \ 172 mom[y][8] += mom[y + d][8]; \ 173 mom[y][9] += mom[y + d][9]; \ 174 } \ 175 barrier(CLK_LOCAL_MEM_FENCE) 176 177 REDUCE(16); 178 REDUCE(8); 179 REDUCE(4); 180 REDUCE(2); 181 182 if (y < 10) 183 { 184 __global int* momout = mom0 + (y0*xtiles + x0) * 10; 185 momout[y] = mom[0][y] + mom[1][y]; 186 } 187 } 188} 189