1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 
24 
25 #include "procs.h"
26 
// OpenCL C source for the compute_sum kernel used by test_wg_barrier.
//
// Arguments:
//   a       - input values to be summed
//   n       - number of elements in a
//   tmp_sum - scratch buffer indexed by local id; needs at least
//             get_local_size(0) elements
//   sum     - single-element output that receives the total
//
// Phase 1: each work-item clears tmp_sum[tid] and adds a[i] for
//          i = tid, tid + lsize, tid + 2*lsize, ... while i < n.
// Phase 2: a pairwise reduction over tmp_sum. hadd(x, 1) is the OpenCL
//          built-in (x + 1) >> 1, so i is half of the current active size
//          rounded up, which is why odd work-group sizes are handled. Each
//          pass starts with a work_group_barrier so the partial sums written
//          in the previous pass (or in phase 1) are visible before they are
//          read.
// Phase 3: work-item 0 stores tmp_sum[0] into *sum.
//
// All indexing uses get_local_id/get_local_size, so the kernel only produces
// a meaningful total when launched as a single work-group.
const char *wg_barrier_kernel_code =
"__kernel void compute_sum(__global int *a, int n, __global int *tmp_sum, __global int *sum)\n"
"{\n"
"    int  tid = get_local_id(0);\n"
"    int  lsize = get_local_size(0);\n"
"    int  i;\n"
"\n"
"    tmp_sum[tid] = 0;\n"
"    for (i=tid; i<n; i+=lsize)\n"
"        tmp_sum[tid] += a[i];\n"
"     \n"
"     // updated to work for any workgroup size \n"
"    for (i=hadd(lsize,1); lsize>1; i = hadd(i,1))\n"
"    {\n"
"        work_group_barrier(CLK_GLOBAL_MEM_FENCE);\n"
"        if (tid + i < lsize)\n"
"            tmp_sum[tid] += tmp_sum[tid + i];\n"
"         lsize = i; \n"
"    }\n"
"\n"
"     //no barrier is required here because last person to write to tmp_sum[0] was tid 0 \n"
"    if (tid == 0)\n"
"        *sum = tmp_sum[0];\n"
"}\n";
51 
52 
53 static int
verify_sum(int * inptr,int * tmpptr,int * outptr,int n)54 verify_sum(int *inptr, int *tmpptr, int *outptr, int n)
55 {
56     int i;
57     int reference = 0;
58 
59     for (i=0; i<n; i++)
60     {
61         reference += inptr[i];
62     }
63 
64     if (reference != outptr[0])
65     {
66         log_error("work_group_barrier test failed\n");
67         return -1;
68     }
69 
70     log_info("work_group_barrier test passed\n");
71     return 0;
72 }
73 
74 
75 int
test_wg_barrier(cl_device_id device,cl_context context,cl_command_queue queue,int num_elements)76 test_wg_barrier(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements)
77 {
78     cl_mem            streams[3];
79     cl_int            *input_ptr = NULL, *output_ptr = NULL, *tmp_ptr =NULL;
80     cl_program        program;
81     cl_kernel        kernel;
82     size_t    global_threads[3];
83     size_t    local_threads[3];
84     int                err;
85     int                i;
86     size_t max_local_workgroup_size[3];
87     size_t max_threadgroup_size = 0;
88     MTdata d;
89 
90     err = create_single_kernel_helper_with_build_options(
91         context, &program, &kernel, 1, &wg_barrier_kernel_code, "compute_sum",
92         nullptr);
93     test_error(err, "Failed to build kernel/program.");
94 
95     err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
96                                  sizeof(max_threadgroup_size), &max_threadgroup_size, NULL);
97     test_error(err, "clGetKernelWorkgroupInfo failed.");
98 
99     err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(max_local_workgroup_size), max_local_workgroup_size, NULL);
100     test_error(err, "clGetDeviceInfo failed for CL_DEVICE_MAX_WORK_ITEM_SIZES");
101 
102     // Pick the minimum of the device and the kernel
103     if (max_threadgroup_size > max_local_workgroup_size[0])
104         max_threadgroup_size = max_local_workgroup_size[0];
105 
106     // work group size must divide evenly into the global size
107     while( num_elements % max_threadgroup_size )
108         max_threadgroup_size--;
109 
110     input_ptr = (int*)malloc(sizeof(int) * num_elements);
111     output_ptr = (int*)malloc(sizeof(int));
112 
113     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
114                                 sizeof(cl_int) * num_elements, NULL, &err);
115     test_error(err, "clCreateBuffer failed.");
116     streams[1] =
117         clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &err);
118     test_error(err, "clCreateBuffer failed.");
119     streams[2] =
120         clCreateBuffer(context, CL_MEM_READ_WRITE,
121                        sizeof(cl_int) * max_threadgroup_size, NULL, &err);
122     test_error(err, "clCreateBuffer failed.");
123 
124     d = init_genrand( gRandomSeed );
125     for (i=0; i<num_elements; i++)
126         input_ptr[i] = (int)get_random_float(-0x01000000, 0x01000000, d);
127     free_mtdata(d);  d = NULL;
128 
129     err = clEnqueueWriteBuffer(queue, streams[0], CL_TRUE, 0, sizeof(cl_int)*num_elements, (void *)input_ptr, 0, NULL, NULL);
130     test_error(err, "clEnqueueWriteBuffer failed.");
131 
132     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0]);
133     err |= clSetKernelArg(kernel, 1, sizeof num_elements, &num_elements);
134     err |= clSetKernelArg(kernel, 2, sizeof streams[2], &streams[2]);
135     err |= clSetKernelArg(kernel, 3, sizeof streams[1], &streams[1]);
136     test_error(err, "clSetKernelArg failed.");
137 
138     global_threads[0] = max_threadgroup_size;
139     local_threads[0] = max_threadgroup_size;
140 
141     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL );
142     test_error(err, "clEnqueueNDRangeKernel failed.");
143 
144     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_int), (void *)output_ptr, 0, NULL, NULL );
145     test_error(err, "clEnqueueReadBuffer failed.");
146 
147     err = verify_sum(input_ptr, tmp_ptr, output_ptr, num_elements);
148 
149     // cleanup
150     clReleaseMemObject(streams[0]);
151     clReleaseMemObject(streams[1]);
152     clReleaseMemObject(streams[2]);
153     clReleaseKernel(kernel);
154     clReleaseProgram(program);
155     free(input_ptr);
156     free(output_ptr);
157 
158     return err;
159 }
160