// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "ComputeProgram.hpp"
#include "Constants.hpp"

#include "System/Debug.hpp"
#include "Vulkan/VkPipelineLayout.hpp"

#include "marl/defer.h"
#include "marl/trace.h"
#include "marl/waitgroup.h"

#include <queue>

namespace {
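// Indices of the X, Y and Z components of a vector or 3D coordinate.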
enum
{
    X,
    Y,
    Z
};

}  // anonymous namespace

namespace sw {

ComputeProgram::ComputeProgram(vk::Device *device, SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
    : device(device)
    , shader(shader)
    , pipelineLayout(pipelineLayout)
    , descriptorSets(descriptorSets)
{
}

ComputeProgram::~ComputeProgram()
{
}

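// Builds the Reactor routine for this compute program: the shader prolog,
// the program body (emit()), and the epilog, followed by phi cleanup.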
void ComputeProgram::generate()
{
    MARL_SCOPED_EVENT("ComputeProgram::generate");

    SpirvRoutine routine(pipelineLayout);
    shader->emitProlog(&routine);
    emit(&routine);
    shader->emitEpilog(&routine);
    shader->clearPhis(&routine);
}

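// Loads the per-dispatch values from the Data structure and binds the
// workgroup-level SPIR-V builtins (NumWorkgroups, WorkgroupId, WorkgroupSize,
// NumSubgroups and SubgroupSize) as shader inputs.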
void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
{
    // TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
    // they are ever going to be read.
    routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
    routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
    routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
    routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
    routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));

    routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
        {
            value[builtin.FirstComponent + component] =
                As<SIMD::Float>(SIMD::Int(Extract(routine->numWorkgroups, component)));
        }
    });

    routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
        {
            value[builtin.FirstComponent + component] =
                As<SIMD::Float>(SIMD::Int(workgroupID[component]));
        }
    });

    routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
        {
            value[builtin.FirstComponent + component] =
                As<SIMD::Float>(SIMD::Int(Extract(routine->workgroupSize, component)));
        }
    });

    routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        ASSERT(builtin.SizeInComponents == 1);
        value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
    });

    routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        ASSERT(builtin.SizeInComponents == 1);
        value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
    });

    routine->setImmutableInputBuiltins(shader);
}

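// Computes the per-lane invocation IDs for the given subgroup and binds the
// subgroup-level SPIR-V builtins (LocalInvocationIndex, SubgroupId,
// LocalInvocationId and GlobalInvocationId).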
void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
{
    Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
    Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));

    // TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
    Int workgroupSizeX = Extract(workgroupSize, X);
    Int workgroupSizeY = Extract(workgroupSize, Y);

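    // Decompose the linear local invocation index into 3D coordinates, where
    // index == Z * (sizeX * sizeY) + Y * sizeX + X. For example, with a
    // workgroup size of (4, 2, 2), index 13 yields Z = 1, Y = 1, X = 1.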
    SIMD::Int localInvocationID[3];
    {
        SIMD::Int idx = localInvocationIndex;
        localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
        idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY);  // modulo
        localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
        idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX);  // modulo
        localInvocationID[X] = idx;
    }

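    // GlobalInvocationId = WorkgroupId * WorkgroupSize + LocalInvocationId,
    // computed per component.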
    Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
    auto localBase = workgroupSize * wgID;
    SIMD::Int globalInvocationID[3];
    globalInvocationID[X] = SIMD::Int(Extract(localBase, X)) + localInvocationID[X];
    globalInvocationID[Y] = SIMD::Int(Extract(localBase, Y)) + localInvocationID[Y];
    globalInvocationID[Z] = SIMD::Int(Extract(localBase, Z)) + localInvocationID[Z];

    routine->localInvocationIndex = localInvocationIndex;
    routine->subgroupIndex = subgroupIndex;
    routine->localInvocationID[X] = localInvocationID[X];
    routine->localInvocationID[Y] = localInvocationID[Y];
    routine->localInvocationID[Z] = localInvocationID[Z];
    routine->globalInvocationID[X] = globalInvocationID[X];
    routine->globalInvocationID[Y] = globalInvocationID[Y];
    routine->globalInvocationID[Z] = globalInvocationID[Z];

    routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        ASSERT(builtin.SizeInComponents == 1);
        value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
    });

    routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        ASSERT(builtin.SizeInComponents == 1);
        value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
    });

    routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
        {
            value[builtin.FirstComponent + component] =
                As<SIMD::Float>(localInvocationID[component]);
        }
    });

    routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
        for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
        {
            value[builtin.FirstComponent + component] =
                As<SIMD::Float>(globalInvocationID[component]);
        }
    });
}

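// Emits the body of the compute routine. The routine arguments are: a pointer
// to the Data structure, the X/Y/Z workgroup ID, a pointer to the workgroup
// shared memory, and the range of subgroups to execute
// ([firstSubgroup, firstSubgroup + subgroupCount)).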
void ComputeProgram::emit(SpirvRoutine *routine)
{
    Pointer<Byte> data = Arg<0>();
    Int workgroupX = Arg<1>();
    Int workgroupY = Arg<2>();
    Int workgroupZ = Arg<3>();
    Pointer<Byte> workgroupMemory = Arg<4>();
    Int firstSubgroup = Arg<5>();
    Int subgroupCount = Arg<6>();

    routine->descriptorSets = data + OFFSET(Data, descriptorSets);
    routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
    routine->pushConstants = data + OFFSET(Data, pushConstants);
    routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
    routine->workgroupMemory = workgroupMemory;

    Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));

    Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
    setWorkgroupBuiltins(data, routine, workgroupID);

    For(Int i = 0, i < subgroupCount, i++)
    {
        auto subgroupIndex = firstSubgroup + i;

        // TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
        auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);

        // Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
        auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
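        // For example, with SIMD::Width == 4 and 6 invocations per workgroup,
        // the second subgroup spans indices 4..7, so the lanes holding
        // indices 6 and 7 are deactivated.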

        setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);

        shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
    }
}

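// Dispatches the given range of workgroups across marl tasks and blocks until
// all of them have finished executing.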
void ComputeProgram::run(
    vk::DescriptorSet::Array const &descriptorSetObjects,
    vk::DescriptorSet::Bindings const &descriptorSets,
    vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
    vk::Pipeline::PushConstantStorage const &pushConstants,
    uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
    uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
    auto &modes = shader->getModes();

    auto invocationsPerSubgroup = SIMD::Width;
    auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
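    // Round up so that a partially filled subgroup still gets executed; e.g.
    // 6 invocations with a subgroup width of 4 require 2 subgroups.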
    auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;

    Data data;
    data.descriptorSets = descriptorSets;
    data.descriptorDynamicOffsets = descriptorDynamicOffsets;
    data.numWorkgroups[X] = groupCountX;
    data.numWorkgroups[Y] = groupCountY;
    data.numWorkgroups[Z] = groupCountZ;
    data.numWorkgroups[3] = 0;
    data.workgroupSize[X] = modes.WorkgroupSizeX;
    data.workgroupSize[Y] = modes.WorkgroupSizeY;
    data.workgroupSize[Z] = modes.WorkgroupSizeZ;
    data.workgroupSize[3] = 0;
    data.invocationsPerSubgroup = invocationsPerSubgroup;
    data.invocationsPerWorkgroup = invocationsPerWorkgroup;
    data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
    data.pushConstants = pushConstants;
    data.constants = &sw::Constants::Get();

    marl::WaitGroup wg;
    const uint32_t batchCount = 16;

    auto groupCount = groupCountX * groupCountY * groupCountZ;

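    // Fan the workgroups out over at most batchCount scheduler tasks: task
    // batchID executes workgroups batchID, batchID + batchCount,
    // batchID + 2 * batchCount, and so on.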
    for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
    {
        wg.add(1);
        marl::schedule([=, &data] {
            defer(wg.done());
            std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());

            for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
            {
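                // Decompose the linear group index into X/Y/Z group offsets
                // (same scheme as the local invocation index decomposition
                // in setSubgroupBuiltins()).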
                auto modulo = groupIndex;
                auto groupOffsetZ = modulo / (groupCountX * groupCountY);
                modulo -= groupOffsetZ * (groupCountX * groupCountY);
                auto groupOffsetY = modulo / groupCountX;
                modulo -= groupOffsetY * groupCountX;
                auto groupOffsetX = modulo;

                auto groupZ = baseGroupZ + groupOffsetZ;
                auto groupY = baseGroupY + groupOffsetY;
                auto groupX = baseGroupX + groupOffsetX;
                MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);

                using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
                std::queue<Coroutine> coroutines;

                if(modes.ContainsControlBarriers)
                {
                    // Make a function call per subgroup so each subgroup
                    // can yield, bringing all subgroups to the barrier
                    // together.
                    for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
                    {
                        auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
                        coroutines.push(std::move(coroutine));
                    }
                }
                else
                {
                    auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
                    coroutines.push(std::move(coroutine));
                }

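                // Resume the coroutines round-robin until all have completed.
                // A coroutine that yields (i.e. reached a control barrier) is
                // pushed back onto the queue, so every subgroup arrives at the
                // barrier before any subgroup continues past it.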
                while(coroutines.size() > 0)
                {
                    auto coroutine = std::move(coroutines.front());
                    coroutines.pop();

                    SpirvShader::YieldResult result;
                    if(coroutine->await(result))
                    {
                        // TODO: Consider result (when the enum is more than 1 entry).
                        coroutines.push(std::move(coroutine));
                    }
                }
            }
        });
    }

    wg.wait();

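    // If the shader performed image writes, notify the descriptor sets so
    // that any state which depends on the image contents can be updated.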
    if(shader->containsImageWrite())
    {
        vk::DescriptorSet::ContentsChanged(descriptorSetObjects, pipelineLayout, device);
    }
}

}  // namespace sw