1//===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Defines some operations of the GPU dialect.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef GPU_OPS
14#define GPU_OPS
15
16include "mlir/Dialect/GPU/GPUBase.td"
17include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
18include "mlir/IR/SymbolInterfaces.td"
19include "mlir/Interfaces/SideEffectInterfaces.td"
20
21//===----------------------------------------------------------------------===//
22// GPU Dialect operations.
23//===----------------------------------------------------------------------===//
24
// Base class for all GPU dialect operations: binds the op to GPU_Dialect and
// forwards the mnemonic and trait list unchanged.
class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
    Op<GPU_Dialect, mnemonic, traits>;
27
// Base class for the index-query ops (thread_id, block_id, block_dim,
// grid_dim): each takes a string `dimension` attribute, produces a single
// `index` result, and is marked side-effect free. The shared C++ verifier
// presumably validates the `dimension` attribute value ("x"/"y"/"z" in the
// examples below) — see ::verifyIndexOp in the dialect sources.
class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :
    GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>,
    Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> {
  let verifier = [{ return ::verifyIndexOp(*this); }];
}
33
// Queries the block size (threads per block) along one dimension.
def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
  let description = [{
    Returns the number of threads in the thread block (aka the block size) along
    the x, y, or z `dimension`.

    Example:

    ```mlir
    %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
    ```
  }];
}
// Queries the index of the current block within the grid along one dimension.
def GPU_BlockIdOp : GPU_IndexOp<"block_id"> {
  let description = [{
    Returns the block id, i.e. the index of the current block within the grid
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
    ```
  }];
}
// Queries the grid size (number of blocks) along one dimension.
def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> {
  let description = [{
    Returns the number of thread blocks in the grid along the x, y, or z
    `dimension`.

    Example:

    ```mlir
    %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
    ```
  }];
}
// Queries the index of the current thread within its block along one dimension.
def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> {
  let description = [{
    Returns the thread id, i.e. the index of the current thread within the block
    along the x, y, or z `dimension`.

    Example:

    ```mlir
    %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
    ```
  }];
}
82
// Returns the index of the current subgroup within its workgroup. Takes no
// operands and is side-effect free.
def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  let description = [{
    Returns the subgroup id, i.e. the index of the current subgroup within the
    workgroup.

    Example:

    ```mlir
    %sgId = gpu.subgroup_id : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  // Nothing to check beyond the declarative ODS constraints.
  let verifier = [{ return success(); }];
}
99
// Returns the number of subgroups in the current workgroup.
def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  let description = [{
    Returns the number of subgroups within a workgroup.

    Example:

    ```mlir
    %numSg = gpu.num_subgroups : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  // Nothing to check beyond the declarative ODS constraints.
  let verifier = [{ return success(); }];
}
115
// Returns the number of threads in a subgroup.
def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [NoSideEffect]>,
    Arguments<(ins)>, Results<(outs Index:$result)> {
  let description = [{
    Returns the number of threads within a subgroup.

    Example:

    ```mlir
    %sgSz = gpu.subgroup_size : index
    ```
  }];

  let assemblyFormat = "attr-dict `:` type($result)";
  // Nothing to check beyond the declarative ODS constraints.
  let verifier = [{ return success(); }];
}
131
// A function nested in a gpu.module. Supports "memory attributions": extra
// region arguments that model buffers in workgroup and private memory spaces.
def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
                                    AutomaticAllocationScope, FunctionLike,
                                    IsolatedFromAbove, Symbol]> {
  let summary = "Function executable on a GPU";

  let description = [{
    Defines a function that can be executed on a GPU. This supports memory
    attribution and its body has a particular execution model.

    GPU functions are either kernels (as indicated by the `kernel` attribute) or
    regular functions. The former can be launched from the host side, while the
    latter are device side only.

    The memory attribution defines SSA values that correspond to memory buffers
    allocated in the memory hierarchy of the GPU (see below).

    The operation has one attached region that corresponds to the body of the
    function. The region arguments consist of the function arguments without
    modification, followed by buffers defined in memory annotations. The body of
    a GPU function, when launched, is executed by multiple work items. There are
    no guarantees on the order in which work items execute, or on the connection
    between them. In particular, work items are not necessarily executed in
    lock-step. Synchronization ops such as "gpu.barrier" should be used to
    coordinate work items. Declarations of GPU functions, i.e. not having the
    body region, are not supported.

    Syntax:

    ```
    op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->`
    function-result-list)?
           memory-attribution `kernel`? function-attributes? region

    memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)?
                           (`private` `(` ssa-id-and-type-list `)`)?
    ```

    Example:

    ```mlir
    gpu.func @foo(%arg0: index)
        workgroup(%workgroup: memref<32xf32, 3>)
        private(%private: memref<1xf32, 5>)
        kernel
        attributes {qux: "quux"} {
      gpu.return
    }
    ```

    The generic form illustrates the concept

    ```mlir
    "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({
    ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>,
         %private: memref<1xf32, 5>):
      "gpu.return"() : () -> ()
    }) : (index) -> ()
    ```

    Note the non-default memory spaces used in memref types in memory
    attribution.
  }];

  let regions = (region AnyRegion:$body);

  // The custom builder below is the only entry point; the auto-generated
  // builders are suppressed.
  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilderDAG<(ins "StringRef":$name, "FunctionType":$type,
      CArg<"TypeRange", "{}">:$workgroupAttributions,
      CArg<"TypeRange", "{}">:$privateAttributions,
      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
  ];

  let extraClassDeclaration = [{
    /// Returns `true` if the GPU function defined by this Op is a kernel, i.e.
    /// it is intended to be launched from host.
    bool isKernel() {
      return (*this)->getAttrOfType<UnitAttr>(
          GPUDialect::getKernelFuncAttrName()) != nullptr;
    }

    /// Change the type of this function in place. This is an extremely
    /// dangerous operation and it is up to the caller to ensure that this is
    /// legal for this function, and to restore invariants:
    ///  - the entry block args must be updated to match the function params.
    ///  - the argument/result attributes may need an update: if the new type
    ///  has less parameters we drop the extra attributes, if there are more
    ///  parameters they won't have any attributes.
    // TODO: consider removing this function thanks to rewrite patterns.
    void setType(FunctionType newType);

    /// Returns the number of buffers located in the workgroup memory.
    unsigned getNumWorkgroupAttributions() {
      return (*this)->getAttrOfType<IntegerAttr>(
          getNumWorkgroupAttributionsAttrName()).getInt();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the workgroup memory
    ArrayRef<BlockArgument> getWorkgroupAttributions() {
      // Workgroup attributions come directly after the function arguments.
      auto begin =
          std::next(getBody().args_begin(), getType().getNumInputs());
      auto end = std::next(begin, getNumWorkgroupAttributions());
      return {begin, end};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// workgroup memory.
    BlockArgument addWorkgroupAttribution(Type type);

    /// Returns the number of buffers located in the private memory.
    unsigned getNumPrivateAttributions() {
      return getBody().getNumArguments() - getType().getNumInputs() -
          getNumWorkgroupAttributions();
    }

    /// Returns a list of block arguments that correspond to buffers located in
    /// the private memory.
    ArrayRef<BlockArgument> getPrivateAttributions() {
      // Buffers on the private memory always come after buffers on the workgroup
      // memory.
      auto begin =
          std::next(getBody().args_begin(),
                    getType().getNumInputs() + getNumWorkgroupAttributions());
      return {begin, getBody().args_end()};
    }

    /// Adds a new block argument that corresponds to buffers located in
    /// private memory.
    BlockArgument addPrivateAttribution(Type type);

    /// Returns the name of the attribute containing the number of buffers
    /// located in the workgroup memory.
    static StringRef getNumWorkgroupAttributionsAttrName() {
      return "workgroup_attributions";
    }

    // FunctionLike trait needs access to the functions below.
    friend class OpTrait::FunctionLike<GPUFuncOp>;

    /// Hooks for the input/output type enumeration in FunctionLike.
    unsigned getNumFuncArguments() { return getType().getNumInputs(); }
    unsigned getNumFuncResults() { return getType().getNumResults(); }

    /// Returns the keywords used in the custom syntax for this Op.
    static StringRef getWorkgroupKeyword() { return "workgroup"; }
    static StringRef getPrivateKeyword() { return "private"; }
    static StringRef getKernelKeyword() { return "kernel"; }

    /// Hook for FunctionLike verifier.
    LogicalResult verifyType();

    /// Verifies the body of the function.
    LogicalResult verifyBody();
  }];

  // No top-level verifier is registered; verification currently runs through
  // the FunctionLike hooks verifyType()/verifyBody() declared above. The
  // disabled draft below also contained a typo ("verifFuncOpy").
  // let verifier = [{ return ::verifyFuncOp(*this); }];
  let printer = [{ printGPUFuncOp(p, *this); }];
  let parser = [{ return parseGPUFuncOp(parser, result); }];
}
293
// Launches a previously-outlined gpu.func kernel. Supports async dependency
// tokens via GPU_AsyncOpInterface; AttrSizedOperandSegments is needed because
// the op mixes two variadic operand groups with the six config operands.
def GPU_LaunchFuncOp : GPU_Op<"launch_func",
                              [GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
    Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
               SymbolRefAttr:$kernel,
               Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
               Variadic<AnyType>:$operands)>,
    Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
  let summary = "Launches a function as a GPU kernel";

  let description = [{
    Launch a kernel function on the specified grid of thread blocks.
    `gpu.launch` operations are lowered to `gpu.launch_func` operations by
    outlining the kernel body into a function in a dedicated module, which
    reflects the separate compilation process. The kernel function is required
    to have the `gpu.kernel` attribute. The module containing the kernel
    function is required to be a gpu.module. And finally, the module containing
    the kernel module (which thus cannot be the top-level module) is required
    to have the `gpu.container_module` attribute. The `gpu.launch_func`
    operation has a symbol attribute named `kernel` to identify the fully
    specified kernel function to launch (both the gpu.module and func).

    The `gpu.launch_func` supports async dependencies: the kernel does not start
    executing until the ops producing those async dependencies have completed.

    By default, the host implicitly blocks until kernel execution has
    completed. If the `async` keyword is present, the host does not block but
    instead a `!gpu.async.token` is returned. Other async GPU ops can take this
    token as dependency.

    The operation requires at least the grid and block sizes along the x,y,z
    dimensions as arguments. When a lower-dimensional kernel is required,
    unused sizes must be explicitly set to `1`.

    The remaining operands are passed as arguments to the kernel function.

    Example:

    ```mlir
    module attributes {gpu.container_module} {

      // This module creates a separate compilation unit for the GPU compiler.
      gpu.module @kernels {
        func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>)
            attributes { nvvm.kernel = true } {

          // Operations that produce block/thread IDs and dimensions are
          // injected when outlining the `gpu.launch` body to a function called
          // by `gpu.launch_func`.
          %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
          %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index)
          %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index)

          %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index)
          %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index)
          %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index)

          %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index)
          %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index)
          %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index)

          %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index)
          %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index)
          %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)

          "some_op"(%bIdX, %tIdX) : (index, index) -> ()
          %42 = load %arg1[%bIdX] : memref<?xf32, 1>
        }
      }

      %t0 = gpu.wait async
      gpu.launch_func
          async                           // (Optional) Don't block host, return token.
          [%t0]                           // (Optional) Execute only after %t0 has completed.
          @kernels::@kernel_1             // Kernel function.
          blocks in (%cst, %cst, %cst)    // Grid size.
          threads in (%cst, %cst, %cst)   // Block size.
          args(%arg0 : f32,               // (Optional) Kernel arguments.
               %arg1 : memref<?xf32, 1>)
    }
    ```
  }];

  // Only the custom builder below is exposed.
  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilderDAG<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize,
      "KernelDim3":$blockSize, "ValueRange":$kernelOperands)>
  ];

  let extraClassDeclaration = [{
    /// The number of operands passed to the kernel function.
    unsigned getNumKernelOperands();

    /// The name of the kernel's containing module.
    StringRef getKernelModuleName();

    /// The name of the kernel.
    StringRef getKernelName();

    /// The i-th operand passed to the kernel function.
    Value getKernelOperand(unsigned i);

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();

    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
    static constexpr unsigned kNumConfigOperands = 6;

    // This needs to quietly verify if attributes with names defined below are
    // present since it is run before the verifier of this op.
    friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
                                                              NamedAttribute);

    /// The name of the symbol reference attribute specifying the kernel to launch.
    static StringRef getKernelAttrName() { return "kernel"; }
  }];

  let verifier = [{ return ::verify(*this); }];
  let assemblyFormat = [{
      custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
      $kernel
      `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
      `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`
      custom<LaunchFuncOperands>($operands, type($operands))
      attr-dict
  }];
}
426
// In-line kernel launch: the kernel body is a region of this op rather than a
// separate function. Later passes outline it into a gpu.func and replace this
// op with gpu.launch_func.
def GPU_LaunchOp : GPU_Op<"launch">,
    Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
               Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>,
    Results<(outs)> {
  let summary = "GPU kernel launch operation";

  let description = [{
    Launch a kernel on the specified grid of thread blocks. The body of the
    kernel is defined by the single region that this operation contains. The
    operation takes six operands, with first three operands being grid sizes
    along x,y,z dimensions and the following three arguments being block sizes
    along x,y,z dimension. When a lower-dimensional kernel is required,
    unused sizes must be explicitly set to `1`.

    The body region has _twelve_ arguments, grouped as follows:

    -   three arguments that contain block identifiers along x,y,z dimensions;
    -   three arguments that contain thread identifiers along x,y,z dimensions;
    -   operands of the `gpu.launch` operation as is (i.e. the operands for
        grid and block sizes).

    Syntax:

    ```
    operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment
                             `threads` `(` ssa-id-list `)` `in` ssa-reassignment
                               region attr-dict?
    ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
    ```

    Example:

    ```mlir
    gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2)
               threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) {
      // Block and thread identifiers, as well as block/grid sizes are
      // immediately usable inside body region.
      "some_op"(%bx, %tx) : (index, index) -> ()
      // Assuming %val1 is defined outside the gpu.launch region.
      %42 = load %val1[%bx] : memref<?xf32, 1>
    }

    // Generic syntax explains how the pretty syntax maps to the IR structure.
    "gpu.launch"(%cst, %cst, %c1,  // Grid sizes.
                 %cst, %c1, %c1)   // Block sizes.

        {/*attributes*/}
        // All sizes and identifiers have "index" size.
        : (index, index, index, index, index, index) -> () {
    // The operation passes block and thread identifiers, followed by grid and
    // block sizes.
    ^bb0(%bx : index, %by : index, %bz : index,
         %tx : index, %ty : index, %tz : index,
         %num_bx : index, %num_by : index, %num_bz : index,
         %num_tx : index, %num_ty : index, %num_tz : index)
      "some_op"(%bx, %tx) : (index, index) -> ()
      %3 = "std.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32
    }
    ```

    Rationale: using operation/block arguments gives analyses a clear way of
    understanding that a value has additional semantics (e.g., we will need to
    know what value corresponds to threadIdx.x for coalescing). We can recover
    these properties by analyzing the operations producing values, but it is
    easier just to have that information by construction.
  }];

  let regions = (region AnyRegion:$body);

  // Only the custom builder below is exposed; it presumably sets up the body
  // region with the twelve leading arguments — confirm in the C++ definition.
  let skipDefaultBuilders = 1;

  let builders = [
    OpBuilderDAG<(ins "Value":$gridSizeX, "Value":$gridSizeY,
      "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY,
      "Value":$blockSizeZ)>
  ];

  let extraClassDeclaration = [{
    /// Get the SSA values corresponding to kernel block identifiers.
    KernelDim3 getBlockIds();
    /// Get the SSA values corresponding to kernel thread identifiers.
    KernelDim3 getThreadIds();
    /// Get the SSA values corresponding to kernel grid size.
    KernelDim3 getGridSize();
    /// Get the SSA values corresponding to kernel block size.
    KernelDim3 getBlockSize();

    /// Get the SSA values passed as operands to specify the grid size.
    KernelDim3 getGridSizeOperandValues();
    /// Get the SSA values passed as operands to specify the block size.
    KernelDim3 getBlockSizeOperandValues();

    static StringRef getBlocksKeyword() { return "blocks"; }
    static StringRef getThreadsKeyword() { return "threads"; }

    /// The number of launch configuration operands, placed at the leading
    /// positions of the operand list.
    static constexpr unsigned kNumConfigOperands = 6;

    /// The number of region attributes containing the launch configuration,
    /// placed in the leading positions of the argument list.
    static constexpr unsigned kNumConfigRegionAttributes = 12;
  }];

  let parser = [{ return parseLaunchOp(parser, result); }];
  let printer = [{ printLaunchOp(p, *this); }];
  let verifier = [{ return ::verify(*this); }];
}
535
// Terminator of gpu.func bodies; its operands are the function results.
def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect,
                                     Terminator]>,
    Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
  let summary = "Terminator for GPU functions.";
  let description = [{
    A terminator operation for regions that appear in the body of  `gpu.func`
    functions. The operands to the `gpu.return` are the result values returned
    by an invocation of the `gpu.func`.
  }];

  // Convenience builder for the common zero-operand return.
  let builders = [OpBuilderDAG<(ins), [{ // empty}]>];

  let parser = [{ return parseReturnOp(parser, result); }];
  let printer = [{ p << getOperationName(); }];
  let verifier = [{ return ::verify(*this); }];
}
552
// Implicit terminator of the gpu.launch body region; carries no operands.
def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">,
                                             NoSideEffect, Terminator]>,
    Arguments<(ins)>, Results<(outs)> {
  let summary = "Terminator for GPU launch regions.";
  let description = [{
    A terminator operation for regions that appear in the body of `gpu.launch`
    operation.  These regions are not expected to return any value so the
    terminator takes no operands.
  }];

  // The op has no operands or attributes, so parsing always succeeds and
  // printing emits just the operation name.
  let parser = [{ return success(); }];
  let printer = [{ p << getOperationName(); }];
}
566
// Generic terminator for blocks nested inside GPU ops with regions (e.g. the
// accumulator region of gpu.all_reduce); forwards values to the enclosing op.
def GPU_YieldOp : GPU_Op<"yield", [NoSideEffect, Terminator]>,
    Arguments<(ins Variadic<AnyType>:$values)> {
  let summary = "GPU yield operation";
  let description = [{
    `gpu.yield` is a special terminator operation for blocks inside regions
    in gpu ops. It returns values to the immediately enclosing gpu op.

    Example:

    ```mlir
    gpu.yield %f0, %f1 : f32, f32
    ```
  }];
}
581
// String-enum cases for the built-in accumulation kinds accepted by
// gpu.all_reduce (collected in GPU_AllReduceOperationAttr below). The case
// names mirror the corresponding XLA enum spellings.
def GPU_AllReduceOpAdd : StrEnumAttrCase<"add">;
def GPU_AllReduceOpAnd : StrEnumAttrCase<"and">;
def GPU_AllReduceOpMax : StrEnumAttrCase<"max">;
def GPU_AllReduceOpMin : StrEnumAttrCase<"min">;
def GPU_AllReduceOpMul : StrEnumAttrCase<"mul">;
def GPU_AllReduceOpOr : StrEnumAttrCase<"or">;
def GPU_AllReduceOpXor : StrEnumAttrCase<"xor">;
590
// String enum attribute listing all reduction kinds gpu.all_reduce accepts
// when the accumulation is given as an attribute instead of a region.
def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr",
    "built-in reduction operations supported by gpu.allreduce.",
    [
      GPU_AllReduceOpAdd,
      GPU_AllReduceOpAnd,
      GPU_AllReduceOpMax,
      GPU_AllReduceOpMin,
      GPU_AllReduceOpMul,
      GPU_AllReduceOpOr,
      GPU_AllReduceOpXor
    ]>;
602
// Workgroup-wide reduction. The accumulation is given either via the optional
// `op` attribute or via the attached region (mutually exclusive per the
// examples below; the C++ verifier presumably enforces this).
def GPU_AllReduceOp : GPU_Op<"all_reduce",
    [SameOperandsAndResultType, IsolatedFromAbove]>,
    Arguments<(ins AnyType:$value,
               OptionalAttr<GPU_AllReduceOperationAttr>:$op)>,
    Results<(outs AnyType)> {
  let summary = "Reduce values among workgroup.";
  let description = [{
    The `all_reduce` op reduces the value of every work item across a local
    workgroup. The result is equal for all work items of a workgroup.

    For example, both

    ```mlir
    %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32)
    %2 = "gpu.all_reduce"(%0) ({
    ^bb(%lhs : f32, %rhs : f32):
      %sum = addf %lhs, %rhs : f32
      "gpu.yield"(%sum) : (f32) -> ()
    }) : (f32) -> (f32)
    ```

    compute the sum of each work item's %0 value. The first version specifies
    the accumulation as operation, whereas the second version specifies the
    accumulation as code region. The accumulation operation must be one of:
    `add`, `and`, `max`, `min`, `mul`, `or`, `xor`.

    Either none or all work items of a workgroup need to execute this op
    in convergence.
  }];
  let regions = (region AnyRegion:$body);
  let verifier = [{ return ::verifyAllReduce(*this); }];
}
635
// Indexing modes for gpu.shuffle; "xor" is currently the only supported mode.
def GPU_ShuffleOpXor : StrEnumAttrCase<"xor">;

def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr",
    "Indexing modes supported by gpu.shuffle.",
    [
      GPU_ShuffleOpXor,
    ]>;
643
// Exchanges a value between invocations of the same subgroup; also returns a
// validity bit since the source lane may be out of range.
def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>,
    Arguments<(ins AnyType:$value, I32:$offset, I32:$width,
               GPU_ShuffleModeAttr:$mode)>,
    Results<(outs AnyType:$result, I1:$valid)> {
  let summary = "Shuffles values within a subgroup.";
  let description = [{
    The "shuffle" op moves values to a different invocation within the same
    subgroup.

    Example:

    ```mlir
    %1, %2 = gpu.shuffle %0, %offset, %width xor : f32
    ```

    For lane k returns the value from lane `k ^ offset` and `true` if that lane
    is smaller than %width. Otherwise it returns an unspecified value and
    `false`. A lane is the index of an invocation relative to its subgroup.

    The width specifies the number of invocations that participate in the
    shuffle. The width needs to be the same for all invocations that participate
    in the shuffle. Exactly the first `width` invocations of a subgroup need to
    execute this op in convergence.
  }];
  let verifier = [{ return ::verifyShuffleOp(*this); }];
  let printer = [{ printShuffleOp(p, *this); }];
  let parser = [{ return parseShuffleOp(parser, result); }];
}
672
// Workgroup execution + memory barrier. Note: deliberately NOT marked
// NoSideEffect — it must not be CSE'd or moved.
def GPU_BarrierOp : GPU_Op<"barrier"> {
  let summary = "Synchronizes all work items of a workgroup.";
  let description = [{
    The "barrier" op synchronizes all work items of a workgroup. It is used
    to coordinate communication between the work items of the workgroup.

    ```mlir
    gpu.barrier
    ```

    waits until all work items in the workgroup have reached this point
    and all memory accesses made by these work items prior to the op are
    visible to all work items in the workgroup. Data hazards between work items
    accessing the same memory can be avoided by synchronizing work items
    in-between these accesses.

    Either none or all work items of a workgroup need to execute this op
    in convergence.
  }];
  // No operands or attributes: trivial parser/printer.
  let parser = [{ return success(); }];
  let printer = [{ p << getOperationName(); }];
}
695
// Container for device-side code. Acts as a symbol table holding gpu.func ops;
// its single block is implicitly terminated by gpu.module_end.
def GPU_GPUModuleOp : GPU_Op<"module", [
  IsolatedFromAbove, SymbolTable, Symbol,
  SingleBlockImplicitTerminator<"ModuleEndOp">
]> {
  let summary = "A top level compilation unit containing code to be run on a GPU.";
  let description = [{
    GPU module contains code that is intended to be run on a GPU. A host device
    can launch this code through a gpu.launch_func that creates a fully
    qualified symbol through the gpu.module's symbol and a gpu.func symbol
    contained in the gpu.module.

    The module's top-level scope is modeled by a single region with a single
    block. GPU modules are required to have a name that is used for symbol
    resolution by the gpu.launch_func operation.

    Using an op with a region to define a GPU module enables "embedding" GPU
    modules with SIMT execution models in other dialects in a clean manner and
    allows filtering of code regions to execute passes on only code intended to
    or not intended to be run on the separate device.

    ```
    gpu.module @symbol_name {
      gpu.func {}
      ...
      gpu.module_end
    }
    ```
  }];
  let builders = [OpBuilderDAG<(ins "StringRef":$name)>];
  let parser = [{ return ::parseGPUModuleOp(parser, result); }];
  let printer = [{ return ::print(p, *this); }];
  let regions = (region SizedRegion<1>:$body);

  // We need to ensure the block inside the region is properly terminated;
  // the auto-generated builders do not guarantee that.
  let skipDefaultBuilders = 1;
}
734
// Implicit terminator of gpu.module's single block (see
// SingleBlockImplicitTerminator on GPU_GPUModuleOp).
def GPU_ModuleEndOp : GPU_Op<"module_end", [
  Terminator, HasParent<"GPUModuleOp">
]> {
  let summary = "A pseudo op that marks the end of a gpu.module.";
  let description = [{
    This op terminates the only block inside the only region of a `gpu.module`.
  }];

  // No operands or attributes: trivial parser/printer.
  let parser = [{ return success(); }];
  let printer = [{ p << getOperationName(); }];
}
746
// Maps a host-side unranked memref into the device address space.
def GPU_HostRegisterOp : GPU_Op<"host_register">,
    Arguments<(ins AnyUnrankedMemRef:$value)> {
  let summary = "Registers a memref for access from device.";
  let description = [{
    This op maps the provided host buffer into the device address space.

    This operation may not be supported in every environment, there is not yet a
    way to check at runtime whether this feature is supported.

    Writes from the host are guaranteed to be visible to device kernels that are
    launched afterwards. Writes from the device are guaranteed to be visible on
    the host after synchronizing with the device kernel completion.
  }];

  let assemblyFormat = "$value attr-dict `:` type($value)";
  // The declarative operand constraint is the only invariant to check.
  let verifier = [{ return success(); }];
}
764
// Joins a list of async tokens: either blocks the host (sync form) or produces
// a new token that depends on all inputs (async form).
def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> {
  let summary = "Wait for async gpu ops to complete.";
  let description = [{
    This op synchronizes the host or the device with a list of dependent ops.

    If the op contains the `async` keyword, it returns a new async token which
    is synchronized with the op arguments. This new token is merely a shortcut
    to the argument list, and one could replace the uses of the result with the
    arguments for the same effect. The async version of this op is primarily
    used to make each async token have a single use during lowering and
    thereby make forks in async execution explicit. Example usage:

    ```mlir
    %t0 = gpu.foo async : !gpu.async.token
    %t1 = gpu.bar async : !gpu.async.token
    %t2 = gpu.wait async [%t0, %t1]
    // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just
    // as if the async dependencies were [%t0, %t1].
    %t3 = gpu.baz async [%t2]
    ```

    If the op does not contain the `async` keyword, it does not return a new
    async token but blocks until all ops producing the async dependency tokens
    finished execution. All dependent memory operations are visible to the host
    once this op completes. Example usage:

    ```mlir
    %t0 = gpu.foo async : !gpu.async.token
    %t1 = gpu.bar async : !gpu.async.token
    // The gpu.wait op blocks until gpu.foo and gpu.bar have completed.
    gpu.wait [%t0, %t1]
    ```
  }];

  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
  // The token is present exactly when the `async` keyword is used.
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
  }];
}
806
// Device-side memory allocation; the async-aware counterpart of std.alloc.
// AttrSizedOperandSegments is required because three variadic operand groups
// (dependencies, dynamic sizes, symbol operands) share the operand list.
def GPU_AllocOp : GPU_Op<"alloc", [
    GPU_AsyncOpInterface,
    AttrSizedOperandSegments,
    MemoryEffects<[MemAlloc<DefaultResource>]>
  ]> {

  let summary = "GPU memory allocation operation.";
  let description = [{
    The `gpu.alloc` operation allocates a region of memory on the GPU. It is
    similar to the `std.alloc` op, but supports asynchronous GPU execution.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it also returns a !gpu.async.token.

    Example:

    ```mlir
    %memref, %token = gpu.alloc async [%dep] (%width) : memref<64x?xf32, 1>
    ```
  }];

  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands);
  let results = (outs Res<AnyMemRef, "", [MemAlloc<DefaultResource>]>:$memref,
                 Optional<GPU_AsyncToken>:$asyncToken);

  let extraClassDeclaration = [{
    /// Returns the memref type of the allocated buffer.
    MemRefType getType() { return memref().getType().cast<MemRefType>(); }
  }];

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) ` `
    `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
  }];
}
846
// Frees memory previously allocated by gpu.alloc; async-aware counterpart of
// std.dealloc.
def GPU_DeallocOp : GPU_Op<"dealloc", [
    GPU_AsyncOpInterface, MemoryEffects<[MemFree]>
  ]> {

  let summary = "GPU memory deallocation operation";

  let description = [{
    The `gpu.dealloc` operation frees the region of memory referenced by a
    memref which was originally created by the `gpu.alloc` operation. It is
    similar to the `std.dealloc` op, but supports asynchronous GPU execution.

    The op does not execute before all async dependencies have finished
    executing.

    If the `async` keyword is present, the op is executed asynchronously (i.e.
    it does not block until the execution has finished on the device). In
    that case, it returns a !gpu.async.token.

    Example:

    ```mlir
    %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1>
    ```
  }];

  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                   Arg<AnyMemRef, "", [MemFree]>:$memref);
  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

  let assemblyFormat = [{
    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
    $memref attr-dict `:` type($memref)
  }];
}
881
882#endif // GPU_OPS
883