1//===-- GPUOps.td - GPU dialect operation definitions ------*- tablegen -*-===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// 9// Defines some operations of the GPU dialect. 10// 11//===----------------------------------------------------------------------===// 12 13#ifndef GPU_OPS 14#define GPU_OPS 15 16include "mlir/Dialect/GPU/GPUBase.td" 17include "mlir/Dialect/LLVMIR/LLVMOpBase.td" 18include "mlir/IR/SymbolInterfaces.td" 19include "mlir/Interfaces/SideEffectInterfaces.td" 20 21//===----------------------------------------------------------------------===// 22// GPU Dialect operations. 23//===----------------------------------------------------------------------===// 24 25class GPU_Op<string mnemonic, list<OpTrait> traits = []> : 26 Op<GPU_Dialect, mnemonic, traits>; 27 28class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> : 29 GPU_Op<mnemonic, !listconcat(traits, [NoSideEffect])>, 30 Arguments<(ins StrAttr:$dimension)>, Results<(outs Index)> { 31 let verifier = [{ return ::verifyIndexOp(*this); }]; 32} 33 34def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> { 35 let description = [{ 36 Returns the number of threads in the thread block (aka the block size) along 37 the x, y, or z `dimension`. 38 39 Example: 40 41 ```mlir 42 %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) 43 ``` 44 }]; 45} 46def GPU_BlockIdOp : GPU_IndexOp<"block_id"> { 47 let description = [{ 48 Returns the block id, i.e. the index of the current block within the grid 49 along the x, y, or z `dimension`. 50 51 Example: 52 53 ```mlir 54 %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) 55 ``` 56 }]; 57} 58def GPU_GridDimOp : GPU_IndexOp<"grid_dim"> { 59 let description = [{ 60 Returns the number of thread blocks in the grid along the x, y, or z 61 `dimension`. 62 63 Example: 64 65 ```mlir 66 %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) 67 ``` 68 }]; 69} 70def GPU_ThreadIdOp : GPU_IndexOp<"thread_id"> { 71 let description = [{ 72 Returns the thread id, i.e. the index of the current thread within the block 73 along the x, y, or z `dimension`. 74 75 Example: 76 77 ```mlir 78 %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) 79 ``` 80 }]; 81} 82 83def GPU_SubgroupIdOp : GPU_Op<"subgroup_id", [NoSideEffect]>, 84 Arguments<(ins)>, Results<(outs Index:$result)> { 85 let description = [{ 86 Returns the subgroup id, i.e. the index of the current subgroup within the 87 workgroup. 88 89 Example: 90 91 ```mlir 92 %sgId = gpu.subgroup_id : index 93 ``` 94 }]; 95 96 let assemblyFormat = "attr-dict `:` type($result)"; 97 let verifier = [{ return success(); }]; 98} 99 100def GPU_NumSubgroupsOp : GPU_Op<"num_subgroups", [NoSideEffect]>, 101 Arguments<(ins)>, Results<(outs Index:$result)> { 102 let description = [{ 103 Returns the number of subgroups within a workgroup. 104 105 Example: 106 107 ```mlir 108 %numSg = gpu.num_subgroups : index 109 ``` 110 }]; 111 112 let assemblyFormat = "attr-dict `:` type($result)"; 113 let verifier = [{ return success(); }]; 114} 115 116def GPU_SubgroupSizeOp : GPU_Op<"subgroup_size", [NoSideEffect]>, 117 Arguments<(ins)>, Results<(outs Index:$result)> { 118 let description = [{ 119 Returns the number of threads within a subgroup. 120 121 Example: 122 123 ```mlir 124 %sgSz = gpu.subgroup_size : index 125 ``` 126 }]; 127 128 let assemblyFormat = "attr-dict `:` type($result)"; 129 let verifier = [{ return success(); }]; 130} 131 132def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">, 133 AutomaticAllocationScope, FunctionLike, 134 IsolatedFromAbove, Symbol]> { 135 let summary = "Function executable on a GPU"; 136 137 let description = [{ 138 Defines a function that can be executed on a GPU. This supports memory 139 attribution and its body has a particular execution model. 140 141 GPU functions are either kernels (as indicated by the `kernel` attribute) or 142 regular functions. The former can be launched from the host side, while the 143 latter are device side only. 144 145 The memory attribution defines SSA values that correspond to memory buffers 146 allocated in the memory hierarchy of the GPU (see below). 147 148 The operation has one attached region that corresponds to the body of the 149 function. The region arguments consist of the function arguments without 150 modification, followed by buffers defined in memory annotations. The body of 151 a GPU function, when launched, is executed by multiple work items. There are 152 no guarantees on the order in which work items execute, or on the connection 153 between them. In particular, work items are not necessarily executed in 154 lock-step. Synchronization ops such as "gpu.barrier" should be used to 155 coordinate work items. Declarations of GPU functions, i.e. not having the 156 body region, are not supported. 157 158 Syntax: 159 160 ``` 161 op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->` 162 function-result-list)? 163 memory-attribution `kernel`? function-attributes? region 164 165 memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? 166 (`private` `(` ssa-id-and-type-list `)`)? 167 ``` 168 169 Example: 170 171 ```mlir 172 gpu.func @foo(%arg0: index) 173 workgroup(%workgroup: memref<32xf32, 3>) 174 private(%private: memref<1xf32, 5>) 175 kernel 176 attributes {qux: "quux"} { 177 gpu.return 178 } 179 ``` 180 181 The generic form illustrates the concept 182 183 ```mlir 184 "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({ 185 ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>, 186 %private: memref<1xf32, 5>): 187 "gpu.return"() : () -> () 188 }) : (index) -> () 189 ``` 190 191 Note the non-default memory spaces used in memref types in memory 192 attribution. 193 }]; 194 195 let regions = (region AnyRegion:$body); 196 197 let skipDefaultBuilders = 1; 198 199 let builders = [ 200 OpBuilderDAG<(ins "StringRef":$name, "FunctionType":$type, 201 CArg<"TypeRange", "{}">:$workgroupAttributions, 202 CArg<"TypeRange", "{}">:$privateAttributions, 203 CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)> 204 ]; 205 206 let extraClassDeclaration = [{ 207 /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. 208 /// it is intended to be launched from host. 209 bool isKernel() { 210 return (*this)->getAttrOfType<UnitAttr>( 211 GPUDialect::getKernelFuncAttrName()) != nullptr; 212 } 213 214 /// Change the type of this function in place. This is an extremely 215 /// dangerous operation and it is up to the caller to ensure that this is 216 /// legal for this function, and to restore invariants: 217 /// - the entry block args must be updated to match the function params. 218 /// - the argument/result attributes may need an update: if the new type 219 /// has less parameters we drop the extra attributes, if there are more 220 /// parameters they won't have any attributes. 221 // TODO: consider removing this function thanks to rewrite patterns. 222 void setType(FunctionType newType); 223 224 /// Returns the number of buffers located in the workgroup memory. 225 unsigned getNumWorkgroupAttributions() { 226 return (*this)->getAttrOfType<IntegerAttr>( 227 getNumWorkgroupAttributionsAttrName()).getInt(); 228 } 229 230 /// Returns a list of block arguments that correspond to buffers located in 231 /// the workgroup memory 232 ArrayRef<BlockArgument> getWorkgroupAttributions() { 233 auto begin = 234 std::next(getBody().args_begin(), getType().getNumInputs()); 235 auto end = std::next(begin, getNumWorkgroupAttributions()); 236 return {begin, end}; 237 } 238 239 /// Adds a new block argument that corresponds to buffers located in 240 /// workgroup memory. 241 BlockArgument addWorkgroupAttribution(Type type); 242 243 /// Returns the number of buffers located in the private memory. 244 unsigned getNumPrivateAttributions() { 245 return getBody().getNumArguments() - getType().getNumInputs() - 246 getNumWorkgroupAttributions(); 247 } 248 249 /// Returns a list of block arguments that correspond to buffers located in 250 /// the private memory. 251 ArrayRef<BlockArgument> getPrivateAttributions() { 252 // Buffers on the private memory always come after buffers on the workgroup 253 // memory. 254 auto begin = 255 std::next(getBody().args_begin(), 256 getType().getNumInputs() + getNumWorkgroupAttributions()); 257 return {begin, getBody().args_end()}; 258 } 259 260 /// Adds a new block argument that corresponds to buffers located in 261 /// private memory. 262 BlockArgument addPrivateAttribution(Type type); 263 264 /// Returns the name of the attribute containing the number of buffers 265 /// located in the workgroup memory. 266 static StringRef getNumWorkgroupAttributionsAttrName() { 267 return "workgroup_attributions"; 268 } 269 270 // FunctionLike trait needs access to the functions below. 271 friend class OpTrait::FunctionLike<GPUFuncOp>; 272 273 /// Hooks for the input/output type enumeration in FunctionLike . 274 unsigned getNumFuncArguments() { return getType().getNumInputs(); } 275 unsigned getNumFuncResults() { return getType().getNumResults(); } 276 277 /// Returns the keywords used in the custom syntax for this Op. 278 static StringRef getWorkgroupKeyword() { return "workgroup"; } 279 static StringRef getPrivateKeyword() { return "private"; } 280 static StringRef getKernelKeyword() { return "kernel"; } 281 282 /// Hook for FunctionLike verifier. 283 LogicalResult verifyType(); 284 285 /// Verifies the body of the function. 286 LogicalResult verifyBody(); 287 }]; 288 289 // let verifier = [{ return ::verifFuncOpy(*this); }]; 290 let printer = [{ printGPUFuncOp(p, *this); }]; 291 let parser = [{ return parseGPUFuncOp(parser, result); }]; 292} 293 294def GPU_LaunchFuncOp : GPU_Op<"launch_func", 295 [GPU_AsyncOpInterface, AttrSizedOperandSegments]>, 296 Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies, 297 SymbolRefAttr:$kernel, 298 Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, 299 Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, 300 Variadic<AnyType>:$operands)>, 301 Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> { 302 let summary = "Launches a function as a GPU kernel"; 303 304 let description = [{ 305 Launch a kernel function on the specified grid of thread blocks. 306 `gpu.launch` operations are lowered to `gpu.launch_func` operations by 307 outlining the kernel body into a function in a dedicated module, which 308 reflects the separate compilation process. The kernel function is required 309 to have the `gpu.kernel` attribute. The module containing the kernel 310 function is required to be a gpu.module. And finally, the module containing 311 the kernel module (which thus cannot be the top-level module) is required 312 to have the `gpu.container_module` attribute. The `gpu.launch_func` 313 operation has a symbol attribute named `kernel` to identify the fully 314 specified kernel function to launch (both the gpu.module and func). 315 316 The `gpu.launch_func` supports async dependencies: the kernel does not start 317 executing until the ops producing those async dependencies have completed. 318 319 By the default, the host implicitly blocks until kernel execution has 320 completed. If the `async` keyword is present, the host does not block but 321 instead a `!gpu.async.token` is returned. Other async GPU ops can take this 322 token as dependency. 323 324 The operation requires at least the grid and block sizes along the x,y,z 325 dimensions as arguments. When a lower-dimensional kernel is required, 326 unused sizes must be explicitly set to `1`. 327 328 The remaining operands are passed as arguments to the kernel function. 329 330 Example: 331 332 ```mlir 333 module attributes {gpu.container_module} { 334 335 // This module creates a separate compilation unit for the GPU compiler. 336 gpu.module @kernels { 337 func @kernel_1(%arg0 : f32, %arg1 : memref<?xf32, 1>) 338 attributes { nvvm.kernel = true } { 339 340 // Operations that produce block/thread IDs and dimensions are 341 // injected when outlining the `gpu.launch` body to a function called 342 // by `gpu.launch_func`. 343 %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) 344 %tIdY = "gpu.thread_id"() {dimension = "y"} : () -> (index) 345 %tIdZ = "gpu.thread_id"() {dimension = "z"} : () -> (index) 346 347 %bDimX = "gpu.block_dim"() {dimension = "x"} : () -> (index) 348 %bDimY = "gpu.block_dim"() {dimension = "y"} : () -> (index) 349 %bDimZ = "gpu.block_dim"() {dimension = "z"} : () -> (index) 350 351 %bIdX = "gpu.block_id"() {dimension = "x"} : () -> (index) 352 %bIdY = "gpu.block_id"() {dimension = "y"} : () -> (index) 353 %bIdZ = "gpu.block_id"() {dimension = "z"} : () -> (index) 354 355 %gDimX = "gpu.grid_dim"() {dimension = "x"} : () -> (index) 356 %gDimY = "gpu.grid_dim"() {dimension = "y"} : () -> (index) 357 %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) 358 359 "some_op"(%bx, %tx) : (index, index) -> () 360 %42 = load %arg1[%bx] : memref<?xf32, 1> 361 } 362 } 363 364 %t0 = gpu.wait async 365 gpu.launch_func 366 async // (Optional) Don't block host, return token. 367 [%t0] // (Optional) Execute only after %t0 has completed. 368 @kernels::@kernel_1 // Kernel function. 369 blocks in (%cst, %cst, %cst) // Grid size. 370 threads in (%cst, %cst, %cst) // Block size. 371 args(%arg0 : f32, // (Optional) Kernel arguments. 372 %arg1 : memref<?xf32, 1>) 373 } 374 ``` 375 }]; 376 377 let skipDefaultBuilders = 1; 378 379 let builders = [ 380 OpBuilderDAG<(ins "GPUFuncOp":$kernelFunc, "KernelDim3":$gridSize, 381 "KernelDim3":$blockSize, "ValueRange":$kernelOperands)> 382 ]; 383 384 let extraClassDeclaration = [{ 385 /// The number of operands passed to the kernel function. 386 unsigned getNumKernelOperands(); 387 388 /// The name of the kernel's containing module. 389 StringRef getKernelModuleName(); 390 391 /// The name of the kernel. 392 StringRef getKernelName(); 393 394 /// The i-th operand passed to the kernel function. 395 Value getKernelOperand(unsigned i); 396 397 /// Get the SSA values passed as operands to specify the grid size. 398 KernelDim3 getGridSizeOperandValues(); 399 400 /// Get the SSA values passed as operands to specify the block size. 401 KernelDim3 getBlockSizeOperandValues(); 402 403 /// The number of launch configuration operands, placed at the leading 404 /// positions of the operand list. 405 static constexpr unsigned kNumConfigOperands = 6; 406 407 // This needs to quietly verify if attributes with names defined below are 408 // present since it is run before the verifier of this op. 409 friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *, 410 NamedAttribute); 411 412 /// The name of the symbol reference attribute specifying the kernel to launch. 413 static StringRef getKernelAttrName() { return "kernel"; } 414 }]; 415 416 let verifier = [{ return ::verify(*this); }]; 417 let assemblyFormat = [{ 418 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 419 $kernel 420 `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)` 421 `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)` 422 custom<LaunchFuncOperands>($operands, type($operands)) 423 attr-dict 424 }]; 425} 426 427def GPU_LaunchOp : GPU_Op<"launch">, 428 Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, 429 Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>, 430 Results<(outs)> { 431 let summary = "GPU kernel launch operation"; 432 433 let description = [{ 434 Launch a kernel on the specified grid of thread blocks. The body of the 435 kernel is defined by the single region that this operation contains. The 436 operation takes six operands, with first three operands being grid sizes 437 along x,y,z dimensions and the following three arguments being block sizes 438 along x,y,z dimension. When a lower-dimensional kernel is required, 439 unused sizes must be explicitly set to `1`. 440 441 The body region has _twelve_ arguments, grouped as follows: 442 443 - three arguments that contain block identifiers along x,y,z dimensions; 444 - three arguments that contain thread identifiers along x,y,z dimensions; 445 - operands of the `gpu.launch` operation as is (i.e. the operands for 446 grid and block sizes). 447 448 Syntax: 449 450 ``` 451 operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment 452 `threads` `(` ssa-id-list `)` `in` ssa-reassignment 453 region attr-dict? 454 ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` 455 ``` 456 457 Example: 458 459 ```mlir 460 gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) 461 threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) { 462 // Block and thread identifiers, as well as block/grid sizes are 463 // immediately usable inside body region. 464 "some_op"(%bx, %tx) : (index, index) -> () 465 // Assuming %val1 is defined outside the gpu.launch region. 466 %42 = load %val1[%bx] : memref<?xf32, 1> 467 } 468 469 // Generic syntax explains how the pretty syntax maps to the IR structure. 470 "gpu.launch"(%cst, %cst, %c1, // Grid sizes. 471 %cst, %c1, %c1) // Block sizes. 472 473 {/*attributes*/} 474 // All sizes and identifiers have "index" size. 475 : (index, index, index, index, index, index) -> () { 476 // The operation passes block and thread identifiers, followed by grid and 477 // block sizes. 478 ^bb0(%bx : index, %by : index, %bz : index, 479 %tx : index, %ty : index, %tz : index, 480 %num_bx : index, %num_by : index, %num_bz : index, 481 %num_tx : index, %num_ty : index, %num_tz : index) 482 "some_op"(%bx, %tx) : (index, index) -> () 483 %3 = "std.load"(%val1, %bx) : (memref<?xf32, 1>, index) -> f32 484 } 485 ``` 486 487 Rationale: using operation/block arguments gives analyses a clear way of 488 understanding that a value has additional semantics (e.g., we will need to 489 know what value corresponds to threadIdx.x for coalescing). We can recover 490 these properties by analyzing the operations producing values, but it is 491 easier just to have that information by construction. 492 }]; 493 494 let regions = (region AnyRegion:$body); 495 496 let skipDefaultBuilders = 1; 497 498 let builders = [ 499 OpBuilderDAG<(ins "Value":$gridSizeX, "Value":$gridSizeY, 500 "Value":$gridSizeZ, "Value":$blockSizeX, "Value":$blockSizeY, 501 "Value":$blockSizeZ)> 502 ]; 503 504 let extraClassDeclaration = [{ 505 /// Get the SSA values corresponding to kernel block identifiers. 506 KernelDim3 getBlockIds(); 507 /// Get the SSA values corresponding to kernel thread identifiers. 508 KernelDim3 getThreadIds(); 509 /// Get the SSA values corresponding to kernel grid size. 510 KernelDim3 getGridSize(); 511 /// Get the SSA values corresponding to kernel block size. 512 KernelDim3 getBlockSize(); 513 514 /// Get the SSA values passed as operands to specify the grid size. 515 KernelDim3 getGridSizeOperandValues(); 516 /// Get the SSA values passed as operands to specify the block size. 517 KernelDim3 getBlockSizeOperandValues(); 518 519 static StringRef getBlocksKeyword() { return "blocks"; } 520 static StringRef getThreadsKeyword() { return "threads"; } 521 522 /// The number of launch configuration operands, placed at the leading 523 /// positions of the operand list. 524 static constexpr unsigned kNumConfigOperands = 6; 525 526 /// The number of region attributes containing the launch configuration, 527 /// placed in the leading positions of the argument list. 528 static constexpr unsigned kNumConfigRegionAttributes = 12; 529 }]; 530 531 let parser = [{ return parseLaunchOp(parser, result); }]; 532 let printer = [{ printLaunchOp(p, *this); }]; 533 let verifier = [{ return ::verify(*this); }]; 534} 535 536def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect, 537 Terminator]>, 538 Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> { 539 let summary = "Terminator for GPU functions."; 540 let description = [{ 541 A terminator operation for regions that appear in the body of `gpu.func` 542 functions. The operands to the `gpu.return` are the result values returned 543 by an invocation of the `gpu.func`. 544 }]; 545 546 let builders = [OpBuilderDAG<(ins), [{ // empty}]>]; 547 548 let parser = [{ return parseReturnOp(parser, result); }]; 549 let printer = [{ p << getOperationName(); }]; 550 let verifier = [{ return ::verify(*this); }]; 551} 552 553def GPU_TerminatorOp : GPU_Op<"terminator", [HasParent<"LaunchOp">, 554 NoSideEffect, Terminator]>, 555 Arguments<(ins)>, Results<(outs)> { 556 let summary = "Terminator for GPU launch regions."; 557 let description = [{ 558 A terminator operation for regions that appear in the body of `gpu.launch` 559 operation. These regions are not expected to return any value so the 560 terminator takes no operands. 561 }]; 562 563 let parser = [{ return success(); }]; 564 let printer = [{ p << getOperationName(); }]; 565} 566 567def GPU_YieldOp : GPU_Op<"yield", [NoSideEffect, Terminator]>, 568 Arguments<(ins Variadic<AnyType>:$values)> { 569 let summary = "GPU yield operation"; 570 let description = [{ 571 gpu.yield` is a special terminator operation for blocks inside regions 572 in gpu ops. It returns values to the immediately enclosing gpu op. 573 574 Example: 575 576 ```mlir 577 gpu.yield %f0, %f1 : f32, f32 578 ``` 579 }]; 580} 581 582// add, mul mirror the XLA ComparisonDirection enum. 583def GPU_AllReduceOpAdd : StrEnumAttrCase<"add">; 584def GPU_AllReduceOpAnd : StrEnumAttrCase<"and">; 585def GPU_AllReduceOpMax : StrEnumAttrCase<"max">; 586def GPU_AllReduceOpMin : StrEnumAttrCase<"min">; 587def GPU_AllReduceOpMul : StrEnumAttrCase<"mul">; 588def GPU_AllReduceOpOr : StrEnumAttrCase<"or">; 589def GPU_AllReduceOpXor : StrEnumAttrCase<"xor">; 590 591def GPU_AllReduceOperationAttr : StrEnumAttr<"AllReduceOperationAttr", 592 "built-in reduction operations supported by gpu.allreduce.", 593 [ 594 GPU_AllReduceOpAdd, 595 GPU_AllReduceOpAnd, 596 GPU_AllReduceOpMax, 597 GPU_AllReduceOpMin, 598 GPU_AllReduceOpMul, 599 GPU_AllReduceOpOr, 600 GPU_AllReduceOpXor 601 ]>; 602 603def GPU_AllReduceOp : GPU_Op<"all_reduce", 604 [SameOperandsAndResultType, IsolatedFromAbove]>, 605 Arguments<(ins AnyType:$value, 606 OptionalAttr<GPU_AllReduceOperationAttr>:$op)>, 607 Results<(outs AnyType)> { 608 let summary = "Reduce values among workgroup."; 609 let description = [{ 610 The `all_reduce` op reduces the value of every work item across a local 611 workgroup. The result is equal for all work items of a workgroup. 612 613 For example, both 614 615 ```mlir 616 %1 = "gpu.all_reduce"(%0) ({}) { op = "add" } : (f32) -> (f32) 617 %2 = "gpu.all_reduce"(%0) ({ 618 ^bb(%lhs : f32, %rhs : f32): 619 %sum = addf %lhs, %rhs : f32 620 "gpu.yield"(%sum) : (f32) -> () 621 }) : (f32) -> (f32) 622 ``` 623 624 compute the sum of each work item's %0 value. The first version specifies 625 the accumulation as operation, whereas the second version specifies the 626 accumulation as code region. The accumulation operation must be one of: 627 `add`, `and`, `max`, `min`, `mul`, `or`, `xor`. 628 629 Either none or all work items of a workgroup need to execute this op 630 in convergence. 631 }]; 632 let regions = (region AnyRegion:$body); 633 let verifier = [{ return ::verifyAllReduce(*this); }]; 634} 635 636def GPU_ShuffleOpXor : StrEnumAttrCase<"xor">; 637 638def GPU_ShuffleModeAttr : StrEnumAttr<"ShuffleModeAttr", 639 "Indexing modes supported by gpu.shuffle.", 640 [ 641 GPU_ShuffleOpXor, 642 ]>; 643 644def GPU_ShuffleOp : GPU_Op<"shuffle", [NoSideEffect]>, 645 Arguments<(ins AnyType:$value, I32:$offset, I32:$width, 646 GPU_ShuffleModeAttr:$mode)>, 647 Results<(outs AnyType:$result, I1:$valid)> { 648 let summary = "Shuffles values within a subgroup."; 649 let description = [{ 650 The "shuffle" op moves values to a different invocation within the same 651 subgroup. 652 653 Example: 654 655 ```mlir 656 %1, %2 = gpu.shuffle %0, %offset, %width xor : f32 657 ``` 658 659 For lane k returns the value from lane `k ^ offset` and `true` if that lane 660 is smaller than %width. Otherwise it returns an unspecified value and 661 `false`. A lane is the index of an invocation relative to its subgroup. 662 663 The width specifies the number of invocations that participate in the 664 shuffle. The width needs to be the same for all invocations that participate 665 in the shuffle. Exactly the first `width` invocations of a subgroup need to 666 execute this op in convergence. 667 }]; 668 let verifier = [{ return ::verifyShuffleOp(*this); }]; 669 let printer = [{ printShuffleOp(p, *this); }]; 670 let parser = [{ return parseShuffleOp(parser, result); }]; 671} 672 673def GPU_BarrierOp : GPU_Op<"barrier"> { 674 let summary = "Synchronizes all work items of a workgroup."; 675 let description = [{ 676 The "barrier" op synchronizes all work items of a workgroup. It is used 677 to coordinate communication between the work items of the workgroup. 678 679 ```mlir 680 gpu.barrier 681 ``` 682 683 waits until all work items in the workgroup have reached this point 684 and all memory accesses made by these work items prior to the op are 685 visible to all work items in the workgroup. Data hazards between work items 686 accessing the same memory can be avoided by synchronizing work items 687 in-between these accesses. 688 689 Either none or all work items of a workgroup need to execute this op 690 in convergence. 691 }]; 692 let parser = [{ return success(); }]; 693 let printer = [{ p << getOperationName(); }]; 694} 695 696def GPU_GPUModuleOp : GPU_Op<"module", [ 697 IsolatedFromAbove, SymbolTable, Symbol, 698 SingleBlockImplicitTerminator<"ModuleEndOp"> 699]> { 700 let summary = "A top level compilation unit containing code to be run on a GPU."; 701 let description = [{ 702 GPU module contains code that is intended to be run on a GPU. A host device 703 can launch this code through a gpu.launc_func that creates a fully 704 qualified symbol through the gpu.module's symbol and a gpu.func symbol 705 contained in the gpu.module. 706 707 The module's top-level scope is modeled by a single region with a single 708 block. GPU modules are required to have a name that is used for symbol 709 resolution by the gpu.launch_func operation. 710 711 Using an op with a region to define a GPU module enables "embedding" GPU 712 modules with SIMT execution models in other dialects in a clean manner and 713 allows filtering of code regions to execute passes on only code intended to 714 or not intended to be run on the separate device. 715 716 ``` 717 gpu.module @symbol_name { 718 gpu.func {} 719 ... 720 gpu.module_end 721 } 722 723 ``` 724 }]; 725 let builders = [OpBuilderDAG<(ins "StringRef":$name)>]; 726 let parser = [{ return ::parseGPUModuleOp(parser, result); }]; 727 let printer = [{ return ::print(p, *this); }]; 728 let regions = (region SizedRegion<1>:$body); 729 730 // We need to ensure the block inside the region is properly terminated; 731 // the auto-generated builders do not guarantee that. 732 let skipDefaultBuilders = 1; 733} 734 735def GPU_ModuleEndOp : GPU_Op<"module_end", [ 736 Terminator, HasParent<"GPUModuleOp"> 737]> { 738 let summary = "A pseudo op that marks the end of a gpu.module."; 739 let description = [{ 740 This op terminates the only block inside the only region of a `gpu.module`. 741 }]; 742 743 let parser = [{ return success(); }]; 744 let printer = [{ p << getOperationName(); }]; 745} 746 747def GPU_HostRegisterOp : GPU_Op<"host_register">, 748 Arguments<(ins AnyUnrankedMemRef:$value)> { 749 let summary = "Registers a memref for access from device."; 750 let description = [{ 751 This op maps the provided host buffer into the device address space. 752 753 This operation may not be supported in every environment, there is not yet a 754 way to check at runtime whether this feature is supported. 755 756 Writes from the host are guaranteed to be visible to device kernels that are 757 launched afterwards. Writes from the device are guaranteed to be visible on 758 the host after synchronizing with the device kernel completion. 759 }]; 760 761 let assemblyFormat = "$value attr-dict `:` type($value)"; 762 let verifier = [{ return success(); }]; 763} 764 765def GPU_WaitOp : GPU_Op<"wait", [GPU_AsyncOpInterface]> { 766 let summary = "Wait for async gpu ops to complete."; 767 let description = [{ 768 This op synchronizes the host or the device with a list of dependent ops. 769 770 If the op contains the `async` keyword, it returns a new async token which 771 is synchronized with the op arguments. This new token is merely a shortcut 772 to the argument list, and one could replace the uses of the result with the 773 arguments for the same effect. The async version of this op is primarily 774 used to make each async token have a single use during lowering and 775 thereby make forks in async execution explicit. Example usage: 776 777 ```mlir 778 %t0 = gpu.foo async : !gpu.async.token 779 %t1 = gpu.bar async : !gpu.async.token 780 %t2 = gpu.wait async [%t0, %t1] 781 // gpu.baz doesn't run until gpu.foo and gpu.bar have both completed, just 782 // as if the async dependencies were [%t0, %t1]. 783 %t3 = gpu.baz async [%t2] 784 ``` 785 786 If the op does not contain the `async` keyword, it does not return a new 787 async token but blocks until all ops producing the async dependency tokens 788 finished execution. All dependent memory operations are visible to the host 789 once this op completes. Example usage: 790 791 ```mlir 792 %t0 = gpu.foo async : !gpu.async.token 793 %t1 = gpu.bar async : !gpu.async.token 794 // The gpu.wait op blocks until gpu.foo and gpu.bar have completed. 795 gpu.wait [%t0, %t1] 796 ``` 797 }]; 798 799 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); 800 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 801 802 let assemblyFormat = [{ 803 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict 804 }]; 805} 806 807def GPU_AllocOp : GPU_Op<"alloc", [ 808 GPU_AsyncOpInterface, 809 AttrSizedOperandSegments, 810 MemoryEffects<[MemAlloc<DefaultResource>]> 811 ]> { 812 813 let summary = "GPU memory allocation operation."; 814 let description = [{ 815 The `gpu.alloc` operation allocates a region of memory on the GPU. It is 816 similar to the `std.alloc` op, but supports asynchronous GPU execution. 817 818 The op does not execute before all async dependencies have finished 819 executing. 820 821 If the `async` keyword is present, the op is executed asynchronously (i.e. 822 it does not block until the execution has finished on the device). In 823 that case, it also returns a !gpu.async.token. 824 825 Example: 826 827 ```mlir 828 %memref, %token = gpu.alloc async [%dep] (%width) : memref<64x?xf32, 1> 829 ``` 830 }]; 831 832 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 833 Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands); 834 let results = (outs Res<AnyMemRef, "", [MemAlloc<DefaultResource>]>:$memref, 835 Optional<GPU_AsyncToken>:$asyncToken); 836 837 let extraClassDeclaration = [{ 838 MemRefType getType() { return memref().getType().cast<MemRefType>(); } 839 }]; 840 841 let assemblyFormat = [{ 842 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) ` ` 843 `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref) 844 }]; 845} 846 847def GPU_DeallocOp : GPU_Op<"dealloc", [ 848 GPU_AsyncOpInterface, MemoryEffects<[MemFree]> 849 ]> { 850 851 let summary = "GPU memory deallocation operation"; 852 853 let description = [{ 854 The `gpu.dealloc` operation frees the region of memory referenced by a 855 memref which was originally created by the `gpu.alloc` operation. It is 856 similar to the `std.dealloc` op, but supports asynchronous GPU execution. 857 858 The op does not execute before all async dependencies have finished 859 executing. 860 861 If the `async` keyword is present, the op is executed asynchronously (i.e. 862 it does not block until the execution has finished on the device). In 863 that case, it returns a !gpu.async.token. 864 865 Example: 866 867 ```mlir 868 %token = gpu.dealloc async [%dep] %memref : memref<8x64xf32, 1> 869 ``` 870 }]; 871 872 let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, 873 Arg<AnyMemRef, "", [MemFree]>:$memref); 874 let results = (outs Optional<GPU_AsyncToken>:$asyncToken); 875 876 let assemblyFormat = [{ 877 custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) 878 $memref attr-dict `:` type($memref) 879 }]; 880} 881 882#endif // GPU_OPS 883