1 //
2 // Copyright 2012 Francisco Jerez
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a
5 // copy of this software and associated documentation files (the "Software"),
6 // to deal in the Software without restriction, including without limitation
7 // the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 // and/or sell copies of the Software, and to permit persons to whom the
9 // Software is furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 // OTHER DEALINGS IN THE SOFTWARE.
21 //
22 
23 #include "core/kernel.hpp"
24 #include "core/resource.hpp"
25 #include "util/factor.hpp"
26 #include "util/u_math.h"
27 #include "pipe/p_context.h"
28 
29 using namespace clover;
30 
kernel(clover::program & prog,const std::string & name,const std::vector<module::argument> & margs)31 kernel::kernel(clover::program &prog, const std::string &name,
32                const std::vector<module::argument> &margs) :
33    program(prog), _name(name), exec(*this),
34    program_ref(prog._kernel_ref_counter) {
35    for (auto &marg : margs) {
36       if (marg.semantic == module::argument::general)
37          _args.emplace_back(argument::create(marg));
38    }
39    for (auto &dev : prog.devices()) {
40       auto &m = prog.build(dev).binary;
41       auto msym = find(name_equals(name), m.syms);
42       const auto f = id_type_equals(msym.section, module::section::data_constant);
43       if (!any_of(f, m.secs))
44          continue;
45 
46       auto mconst = find(f, m.secs);
47       auto rb = std::make_unique<root_buffer>(prog.context(),
48                                               CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
49                                               mconst.size, mconst.data.data());
50       _constant_buffers.emplace(&dev, std::move(rb));
51    }
52 }
53 
54 template<typename V>
55 static inline std::vector<uint>
pad_vector(command_queue & q,const V & v,uint x)56 pad_vector(command_queue &q, const V &v, uint x) {
57    std::vector<uint> w { v.begin(), v.end() };
58    w.resize(q.device().max_block_size().size(), x);
59    return w;
60 }
61 
62 void
launch(command_queue & q,const std::vector<size_t> & grid_offset,const std::vector<size_t> & grid_size,const std::vector<size_t> & block_size)63 kernel::launch(command_queue &q,
64                const std::vector<size_t> &grid_offset,
65                const std::vector<size_t> &grid_size,
66                const std::vector<size_t> &block_size) {
67    const auto m = program().build(q.device()).binary;
68    const auto reduced_grid_size =
69       map(divides(), grid_size, block_size);
70    void *st = exec.bind(&q, grid_offset);
71    struct pipe_grid_info info = {};
72 
73    // The handles are created during exec_context::bind(), so we need make
74    // sure to call exec_context::bind() before retrieving them.
75    std::vector<uint32_t *> g_handles = map([&](size_t h) {
76          return (uint32_t *)&exec.input[h];
77       }, exec.g_handles);
78 
79    q.pipe->bind_compute_state(q.pipe, st);
80    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
81                                0, exec.samplers.size(),
82                                exec.samplers.data());
83 
84    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
85                              exec.sviews.size(), exec.sviews.data());
86    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
87                              exec.iviews.size(), exec.iviews.data());
88    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
89                                  exec.resources.data());
90    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
91                               exec.g_buffers.data(), g_handles.data());
92 
93    // Fill information for the launch_grid() call.
94    info.work_dim = grid_size.size();
95    copy(pad_vector(q, block_size, 1), info.block);
96    copy(pad_vector(q, reduced_grid_size, 1), info.grid);
97    info.pc = find(name_equals(_name), m.syms).offset;
98    info.input = exec.input.data();
99 
100    q.pipe->launch_grid(q.pipe, &info);
101 
102    q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
103    q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
104    q.pipe->set_shader_images(q.pipe, PIPE_SHADER_COMPUTE, 0,
105                              exec.iviews.size(), NULL);
106    q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
107                              exec.sviews.size(), NULL);
108    q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
109                                exec.samplers.size(), NULL);
110 
111    q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
112    exec.unbind();
113 }
114 
115 size_t
mem_local() const116 kernel::mem_local() const {
117    size_t sz = 0;
118 
119    for (auto &arg : args()) {
120       if (dynamic_cast<local_argument *>(&arg))
121          sz += arg.storage();
122    }
123 
124    return sz;
125 }
126 
127 size_t
mem_private() const128 kernel::mem_private() const {
129    return 0;
130 }
131 
132 const std::string &
name() const133 kernel::name() const {
134    return _name;
135 }
136 
137 std::vector<size_t>
optimal_block_size(const command_queue & q,const std::vector<size_t> & grid_size) const138 kernel::optimal_block_size(const command_queue &q,
139                            const std::vector<size_t> &grid_size) const {
140    return factor::find_grid_optimal_factor<size_t>(
141       q.device().max_threads_per_block(), q.device().max_block_size(),
142       grid_size);
143 }
144 
145 std::vector<size_t>
required_block_size() const146 kernel::required_block_size() const {
147    return find(name_equals(_name), program().symbols()).reqd_work_group_size;
148 }
149 
150 kernel::argument_range
args()151 kernel::args() {
152    return map(derefs(), _args);
153 }
154 
155 kernel::const_argument_range
args() const156 kernel::args() const {
157    return map(derefs(), _args);
158 }
159 
160 std::vector<clover::module::arg_info>
args_infos()161 kernel::args_infos() {
162    std::vector<clover::module::arg_info> infos;
163    for (auto &marg: find(name_equals(_name), program().symbols()).args)
164       if (marg.semantic == clover::module::argument::general)
165          infos.emplace_back(marg.info);
166 
167    return infos;
168 }
169 
170 const module &
module(const command_queue & q) const171 kernel::module(const command_queue &q) const {
172    return program().build(q.device()).binary;
173 }
174 
exec_context(kernel & kern)175 kernel::exec_context::exec_context(kernel &kern) :
176    kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
177 }
178 
~exec_context()179 kernel::exec_context::~exec_context() {
180    if (st)
181       q->pipe->delete_compute_state(q->pipe, st);
182 }
183 
184 void *
bind(intrusive_ptr<command_queue> _q,const std::vector<size_t> & grid_offset)185 kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
186                            const std::vector<size_t> &grid_offset) {
187    std::swap(q, _q);
188 
189    // Bind kernel arguments.
190    auto &m = kern.program().build(q->device()).binary;
191    auto msym = find(name_equals(kern.name()), m.syms);
192    auto margs = msym.args;
193    auto msec = find(id_type_equals(msym.section, module::section::text_executable), m.secs);
194    auto explicit_arg = kern._args.begin();
195 
196    for (auto &marg : margs) {
197       switch (marg.semantic) {
198       case module::argument::general:
199          (*(explicit_arg++))->bind(*this, marg);
200          break;
201 
202       case module::argument::grid_dimension: {
203          const cl_uint dimension = grid_offset.size();
204          auto arg = argument::create(marg);
205 
206          arg->set(sizeof(dimension), &dimension);
207          arg->bind(*this, marg);
208          break;
209       }
210       case module::argument::grid_offset: {
211          for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
212             auto arg = argument::create(marg);
213 
214             arg->set(sizeof(x), &x);
215             arg->bind(*this, marg);
216          }
217          break;
218       }
219       case module::argument::image_size: {
220          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
221          std::vector<cl_uint> image_size{
222                static_cast<cl_uint>(img->width()),
223                static_cast<cl_uint>(img->height()),
224                static_cast<cl_uint>(img->depth())};
225          for (auto x : image_size) {
226             auto arg = argument::create(marg);
227 
228             arg->set(sizeof(x), &x);
229             arg->bind(*this, marg);
230          }
231          break;
232       }
233       case module::argument::image_format: {
234          auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
235          cl_image_format fmt = img->format();
236          std::vector<cl_uint> image_format{
237                static_cast<cl_uint>(fmt.image_channel_data_type),
238                static_cast<cl_uint>(fmt.image_channel_order)};
239          for (auto x : image_format) {
240             auto arg = argument::create(marg);
241 
242             arg->set(sizeof(x), &x);
243             arg->bind(*this, marg);
244          }
245          break;
246       }
247       case module::argument::constant_buffer: {
248          auto arg = argument::create(marg);
249          cl_mem buf = kern._constant_buffers.at(&q->device()).get();
250          arg->set(q->device().address_bits() / 8, &buf);
251          arg->bind(*this, marg);
252          break;
253       }
254       }
255    }
256 
257    // Create a new compute state if anything changed.
258    if (!st || q != _q ||
259        cs.req_local_mem != mem_local ||
260        cs.req_input_mem != input.size()) {
261       if (st)
262          _q->pipe->delete_compute_state(_q->pipe, st);
263 
264       cs.ir_type = q->device().ir_format();
265       cs.prog = &(msec.data[0]);
266       cs.req_local_mem = mem_local;
267       cs.req_input_mem = input.size();
268       st = q->pipe->create_compute_state(q->pipe, &cs);
269       if (!st) {
270          unbind(); // Cleanup
271          throw error(CL_OUT_OF_RESOURCES);
272       }
273    }
274 
275    return st;
276 }
277 
278 void
unbind()279 kernel::exec_context::unbind() {
280    for (auto &arg : kern.args())
281       arg.unbind(*this);
282 
283    input.clear();
284    samplers.clear();
285    sviews.clear();
286    iviews.clear();
287    resources.clear();
288    g_buffers.clear();
289    g_handles.clear();
290    mem_local = 0;
291 }
292 
293 namespace {
294    template<typename T>
295    std::vector<uint8_t>
bytes(const T & x)296    bytes(const T& x) {
297       return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
298    }
299 
300    ///
301    /// Transform buffer \a v from the native byte order into the byte
302    /// order specified by \a e.
303    ///
304    template<typename T>
305    void
byteswap(T & v,pipe_endian e)306    byteswap(T &v, pipe_endian e) {
307       if (PIPE_ENDIAN_NATIVE != e)
308          std::reverse(v.begin(), v.end());
309    }
310 
311    ///
312    /// Pad buffer \a v to the next multiple of \a n.
313    ///
314    template<typename T>
315    void
align(T & v,size_t n)316    align(T &v, size_t n) {
317       v.resize(util_align_npot(v.size(), n));
318    }
319 
320    bool
msb(const std::vector<uint8_t> & s)321    msb(const std::vector<uint8_t> &s) {
322       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
323          return s.back() & 0x80;
324       else
325          return s.front() & 0x80;
326    }
327 
328    ///
329    /// Resize buffer \a v to size \a n using sign or zero extension
330    /// according to \a ext.
331    ///
332    template<typename T>
333    void
extend(T & v,enum module::argument::ext_type ext,size_t n)334    extend(T &v, enum module::argument::ext_type ext, size_t n) {
335       const size_t m = std::min(v.size(), n);
336       const bool sign_ext = (ext == module::argument::sign_ext);
337       const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
338       T w(n, fill);
339 
340       if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
341          std::copy_n(v.begin(), m, w.begin());
342       else
343          std::copy_n(v.end() - m, m, w.end() - m);
344 
345       std::swap(v, w);
346    }
347 
348    ///
349    /// Append buffer \a w to \a v.
350    ///
351    template<typename T>
352    void
insert(T & v,const T & w)353    insert(T &v, const T &w) {
354       v.insert(v.end(), w.begin(), w.end());
355    }
356 
357    ///
358    /// Append \a n elements to the end of buffer \a v.
359    ///
360    template<typename T>
361    size_t
allocate(T & v,size_t n)362    allocate(T &v, size_t n) {
363       size_t pos = v.size();
364       v.resize(pos + n);
365       return pos;
366    }
367 }
368 
369 std::unique_ptr<kernel::argument>
create(const module::argument & marg)370 kernel::argument::create(const module::argument &marg) {
371    switch (marg.type) {
372    case module::argument::scalar:
373       return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
374 
375    case module::argument::global:
376       return std::unique_ptr<kernel::argument>(new global_argument);
377 
378    case module::argument::local:
379       return std::unique_ptr<kernel::argument>(new local_argument);
380 
381    case module::argument::constant:
382       return std::unique_ptr<kernel::argument>(new constant_argument);
383 
384    case module::argument::image2d_rd:
385    case module::argument::image3d_rd:
386       return std::unique_ptr<kernel::argument>(new image_rd_argument);
387 
388    case module::argument::image2d_wr:
389    case module::argument::image3d_wr:
390       return std::unique_ptr<kernel::argument>(new image_wr_argument);
391 
392    case module::argument::sampler:
393       return std::unique_ptr<kernel::argument>(new sampler_argument);
394 
395    }
396    throw error(CL_INVALID_KERNEL_DEFINITION);
397 }
398 
argument()399 kernel::argument::argument() : _set(false) {
400 }
401 
402 bool
set() const403 kernel::argument::set() const {
404    return _set;
405 }
406 
407 size_t
storage() const408 kernel::argument::storage() const {
409    return 0;
410 }
411 
scalar_argument(size_t size)412 kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
413 }
414 
415 void
set(size_t size,const void * value)416 kernel::scalar_argument::set(size_t size, const void *value) {
417    if (!value)
418       throw error(CL_INVALID_ARG_VALUE);
419 
420    if (size != this->size)
421       throw error(CL_INVALID_ARG_SIZE);
422 
423    v = { (uint8_t *)value, (uint8_t *)value + size };
424    _set = true;
425 }
426 
427 void
bind(exec_context & ctx,const module::argument & marg)428 kernel::scalar_argument::bind(exec_context &ctx,
429                               const module::argument &marg) {
430    auto w = v;
431 
432    extend(w, marg.ext_type, marg.target_size);
433    byteswap(w, ctx.q->device().endianness());
434    align(ctx.input, marg.target_align);
435    insert(ctx.input, w);
436 }
437 
438 void
unbind(exec_context & ctx)439 kernel::scalar_argument::unbind(exec_context &ctx) {
440 }
441 
442 void
set(size_t size,const void * value)443 kernel::global_argument::set(size_t size, const void *value) {
444    if (size != sizeof(cl_mem))
445       throw error(CL_INVALID_ARG_SIZE);
446 
447    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
448    svm = nullptr;
449    _set = true;
450 }
451 
452 void
set_svm(const void * value)453 kernel::global_argument::set_svm(const void *value) {
454    svm = value;
455    buf = nullptr;
456    _set = true;
457 }
458 
459 void
bind(exec_context & ctx,const module::argument & marg)460 kernel::global_argument::bind(exec_context &ctx,
461                               const module::argument &marg) {
462    align(ctx.input, marg.target_align);
463 
464    if (buf) {
465       const resource &r = buf->resource_in(*ctx.q);
466       ctx.g_handles.push_back(ctx.input.size());
467       ctx.g_buffers.push_back(r.pipe);
468 
469       // How to handle multi-demensional offsets?
470       // We don't need to.  Buffer offsets are always
471       // one-dimensional.
472       auto v = bytes(r.offset[0]);
473       extend(v, marg.ext_type, marg.target_size);
474       byteswap(v, ctx.q->device().endianness());
475       insert(ctx.input, v);
476    } else if (svm) {
477       auto v = bytes(svm);
478       extend(v, marg.ext_type, marg.target_size);
479       byteswap(v, ctx.q->device().endianness());
480       insert(ctx.input, v);
481    } else {
482       // Null pointer.
483       allocate(ctx.input, marg.target_size);
484    }
485 }
486 
487 void
unbind(exec_context & ctx)488 kernel::global_argument::unbind(exec_context &ctx) {
489 }
490 
491 size_t
storage() const492 kernel::local_argument::storage() const {
493    return _storage;
494 }
495 
496 void
set(size_t size,const void * value)497 kernel::local_argument::set(size_t size, const void *value) {
498    if (value)
499       throw error(CL_INVALID_ARG_VALUE);
500 
501    if (!size)
502       throw error(CL_INVALID_ARG_SIZE);
503 
504    _storage = size;
505    _set = true;
506 }
507 
508 void
bind(exec_context & ctx,const module::argument & marg)509 kernel::local_argument::bind(exec_context &ctx,
510                              const module::argument &marg) {
511    auto v = bytes(ctx.mem_local);
512 
513    extend(v, module::argument::zero_ext, marg.target_size);
514    byteswap(v, ctx.q->device().endianness());
515    align(ctx.input, marg.target_align);
516    insert(ctx.input, v);
517 
518    ctx.mem_local += _storage;
519 }
520 
521 void
unbind(exec_context & ctx)522 kernel::local_argument::unbind(exec_context &ctx) {
523 }
524 
525 void
set(size_t size,const void * value)526 kernel::constant_argument::set(size_t size, const void *value) {
527    if (size != sizeof(cl_mem))
528       throw error(CL_INVALID_ARG_SIZE);
529 
530    buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
531    _set = true;
532 }
533 
534 void
bind(exec_context & ctx,const module::argument & marg)535 kernel::constant_argument::bind(exec_context &ctx,
536                                 const module::argument &marg) {
537    align(ctx.input, marg.target_align);
538 
539    if (buf) {
540       resource &r = buf->resource_in(*ctx.q);
541       auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);
542 
543       extend(v, module::argument::zero_ext, marg.target_size);
544       byteswap(v, ctx.q->device().endianness());
545       insert(ctx.input, v);
546 
547       st = r.bind_surface(*ctx.q, false);
548       ctx.resources.push_back(st);
549    } else {
550       // Null pointer.
551       allocate(ctx.input, marg.target_size);
552    }
553 }
554 
555 void
unbind(exec_context & ctx)556 kernel::constant_argument::unbind(exec_context &ctx) {
557    if (buf)
558       buf->resource_in(*ctx.q).unbind_surface(*ctx.q, st);
559 }
560 
561 void
set(size_t size,const void * value)562 kernel::image_rd_argument::set(size_t size, const void *value) {
563    if (!value)
564       throw error(CL_INVALID_ARG_VALUE);
565 
566    if (size != sizeof(cl_mem))
567       throw error(CL_INVALID_ARG_SIZE);
568 
569    img = &obj<image>(*(cl_mem *)value);
570    _set = true;
571 }
572 
573 void
bind(exec_context & ctx,const module::argument & marg)574 kernel::image_rd_argument::bind(exec_context &ctx,
575                                 const module::argument &marg) {
576    auto v = bytes(ctx.sviews.size());
577 
578    extend(v, module::argument::zero_ext, marg.target_size);
579    byteswap(v, ctx.q->device().endianness());
580    align(ctx.input, marg.target_align);
581    insert(ctx.input, v);
582 
583    st = img->resource_in(*ctx.q).bind_sampler_view(*ctx.q);
584    ctx.sviews.push_back(st);
585 }
586 
587 void
unbind(exec_context & ctx)588 kernel::image_rd_argument::unbind(exec_context &ctx) {
589    img->resource_in(*ctx.q).unbind_sampler_view(*ctx.q, st);
590 }
591 
592 void
set(size_t size,const void * value)593 kernel::image_wr_argument::set(size_t size, const void *value) {
594    if (!value)
595       throw error(CL_INVALID_ARG_VALUE);
596 
597    if (size != sizeof(cl_mem))
598       throw error(CL_INVALID_ARG_SIZE);
599 
600    img = &obj<image>(*(cl_mem *)value);
601    _set = true;
602 }
603 
604 void
bind(exec_context & ctx,const module::argument & marg)605 kernel::image_wr_argument::bind(exec_context &ctx,
606                                 const module::argument &marg) {
607    auto v = bytes(ctx.iviews.size());
608 
609    extend(v, module::argument::zero_ext, marg.target_size);
610    byteswap(v, ctx.q->device().endianness());
611    align(ctx.input, marg.target_align);
612    insert(ctx.input, v);
613    ctx.iviews.push_back(img->resource_in(*ctx.q).create_image_view(*ctx.q));
614 }
615 
616 void
unbind(exec_context & ctx)617 kernel::image_wr_argument::unbind(exec_context &ctx) {
618 }
619 
620 void
set(size_t size,const void * value)621 kernel::sampler_argument::set(size_t size, const void *value) {
622    if (!value)
623       throw error(CL_INVALID_SAMPLER);
624 
625    if (size != sizeof(cl_sampler))
626       throw error(CL_INVALID_ARG_SIZE);
627 
628    s = &obj(*(cl_sampler *)value);
629    _set = true;
630 }
631 
632 void
bind(exec_context & ctx,const module::argument & marg)633 kernel::sampler_argument::bind(exec_context &ctx,
634                                const module::argument &marg) {
635    st = s->bind(*ctx.q);
636    ctx.samplers.push_back(st);
637 }
638 
639 void
unbind(exec_context & ctx)640 kernel::sampler_argument::unbind(exec_context &ctx) {
641    s->unbind(*ctx.q, st);
642 }
643