1 // Copyright 2019 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "ComputeProgram.hpp"
16 #include "Constants.hpp"
17 
18 #include "System/Debug.hpp"
19 #include "Vulkan/VkPipelineLayout.hpp"
20 
21 #include "marl/defer.h"
22 #include "marl/trace.h"
23 #include "marl/waitgroup.h"
24 
25 #include <queue>
26 
namespace {

// Component indices used to address the X/Y/Z elements of the 3-component
// workgroup / invocation ID vectors.
enum
{
	X = 0,
	Y = 1,
	Z = 2
};

}  // anonymous namespace
37 
38 namespace sw {
39 
// Constructs a compute program wrapping the given SPIR-V shader.
// Only stores the construction parameters; the executable routine itself is
// built later by generate(). All pointers/references are non-owning and must
// outlive this object.
ComputeProgram::ComputeProgram(vk::Device *device, SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
    : device(device)
    , shader(shader)
    , pipelineLayout(pipelineLayout)
    , descriptorSets(descriptorSets)
{
}
47 
~ComputeProgram()48 ComputeProgram::~ComputeProgram()
49 {
50 }
51 
generate()52 void ComputeProgram::generate()
53 {
54 	MARL_SCOPED_EVENT("ComputeProgram::generate");
55 
56 	SpirvRoutine routine(pipelineLayout);
57 	shader->emitProlog(&routine);
58 	emit(&routine);
59 	shader->emitEpilog(&routine);
60 	shader->clearPhis(&routine);
61 }
62 
// Assigns the per-workgroup built-in values (NumWorkgroups, WorkgroupId,
// WorkgroupSize, NumSubgroups, SubgroupSize) to the routine. These values are
// uniform across the workgroup, so they are set once per workgroup rather
// than once per subgroup. 'data' points to the dispatch-wide Data structure;
// 'workgroupID' holds this workgroup's scalar X/Y/Z coordinates.
void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
{
	// TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
	// they are ever going to be read.
	routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
	// Pack the three scalar workgroup coordinates into a single Int4
	// (fourth component left zero).
	routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
	routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
	routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
	routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));

	// For each built-in that the shader declares as an input, broadcast the
	// (workgroup-uniform) scalar value to every SIMD lane.
	routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(SIMD::Int(Extract(routine->numWorkgroups, component)));
		}
	});

	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(SIMD::Int(workgroupID[component]));
		}
	});

	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(SIMD::Int(Extract(routine->workgroupSize, component)));
		}
	});

	routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
	});

	routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
	});

	// Built-ins whose values are constant for the whole dispatch
	// (handled by the shader itself).
	routine->setImmutableInputBuiltins(shader);
}
109 
// Assigns the per-subgroup built-in values (LocalInvocationIndex, SubgroupId,
// LocalInvocationId, GlobalInvocationId) to the routine. Called once per
// subgroup from the emitted loop in emit(); 'localInvocationIndex' holds the
// per-lane linear invocation index within the workgroup.
void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
{
	Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
	Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));

	// TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
	Int workgroupSizeX = Extract(workgroupSize, X);
	Int workgroupSizeY = Extract(workgroupSize, Y);

	// Decompose the linear local invocation index into 3D local coordinates:
	// index = z * (sizeX * sizeY) + y * sizeX + x.
	SIMD::Int localInvocationID[3];
	{
		SIMD::Int idx = localInvocationIndex;
		localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
		idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY);  // modulo
		localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
		idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX);  // modulo
		localInvocationID[X] = idx;
	}

	// Global invocation ID = workgroupID * workgroupSize + localInvocationID,
	// per component.
	Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
	auto localBase = workgroupSize * wgID;
	SIMD::Int globalInvocationID[3];
	globalInvocationID[X] = SIMD::Int(Extract(localBase, X)) + localInvocationID[X];
	globalInvocationID[Y] = SIMD::Int(Extract(localBase, Y)) + localInvocationID[Y];
	globalInvocationID[Z] = SIMD::Int(Extract(localBase, Z)) + localInvocationID[Z];

	// Store the computed IDs on the routine so shader instructions can read
	// them directly.
	routine->localInvocationIndex = localInvocationIndex;
	routine->subgroupIndex = subgroupIndex;
	routine->localInvocationID[X] = localInvocationID[X];
	routine->localInvocationID[Y] = localInvocationID[Y];
	routine->localInvocationID[Z] = localInvocationID[Z];
	routine->globalInvocationID[X] = globalInvocationID[X];
	routine->globalInvocationID[Y] = globalInvocationID[Y];
	routine->globalInvocationID[Z] = globalInvocationID[Z];

	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
	});

	routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		// Broadcast the scalar subgroup index to every lane.
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
	});

	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(localInvocationID[component]);
		}
	});

	routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(globalInvocationID[component]);
		}
	});
}
171 
// Emits the body of the compute routine. The generated routine is invoked as
// a coroutine with arguments (data, workgroupX, workgroupY, workgroupZ,
// workgroupMemory, firstSubgroup, subgroupCount) — matching the operator()
// calls in run(). It executes 'subgroupCount' subgroups of the given
// workgroup, starting at 'firstSubgroup'.
void ComputeProgram::emit(SpirvRoutine *routine)
{
	Pointer<Byte> data = Arg<0>();
	Int workgroupX = Arg<1>();
	Int workgroupY = Arg<2>();
	Int workgroupZ = Arg<3>();
	Pointer<Byte> workgroupMemory = Arg<4>();
	Int firstSubgroup = Arg<5>();
	Int subgroupCount = Arg<6>();

	// Wire up the routine's pointers into the dispatch-wide Data structure.
	routine->descriptorSets = data + OFFSET(Data, descriptorSets);
	routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
	routine->pushConstants = data + OFFSET(Data, pushConstants);
	routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
	routine->workgroupMemory = workgroupMemory;

	Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));

	Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
	setWorkgroupBuiltins(data, routine, workgroupID);

	// Execute each subgroup assigned to this routine invocation.
	For(Int i = 0, i < subgroupCount, i++)
	{
		auto subgroupIndex = firstSubgroup + i;

		// Per-lane linear invocation index within the workgroup.
		// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
		auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);

		// Disable lanes where (invocationIDs >= invocationsPerWorkgroup).
		// The last subgroup may be partially populated when the workgroup
		// size is not a multiple of the SIMD width.
		auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));

		setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);

		shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
	}
}
208 
// Executes a compute dispatch: runs groupCountX * groupCountY * groupCountZ
// workgroups, offset by (baseGroupX, baseGroupY, baseGroupZ), on the marl
// scheduler. Blocks until every workgroup has completed.
void ComputeProgram::run(
    vk::DescriptorSet::Array const &descriptorSetObjects,
    vk::DescriptorSet::Bindings const &descriptorSets,
    vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
    vk::Pipeline::PushConstantStorage const &pushConstants,
    uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
    uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
	auto &modes = shader->getModes();

	// A subgroup is one SIMD-wide batch of invocations. Round up so a
	// workgroup size that is not a multiple of the SIMD width still covers
	// every invocation (the trailing lanes are masked off in emit()).
	auto invocationsPerSubgroup = SIMD::Width;
	auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
	auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;

	// Dispatch-wide data, shared (read-only) by every scheduled task below.
	Data data;
	data.descriptorSets = descriptorSets;
	data.descriptorDynamicOffsets = descriptorDynamicOffsets;
	data.numWorkgroups[X] = groupCountX;
	data.numWorkgroups[Y] = groupCountY;
	data.numWorkgroups[Z] = groupCountZ;
	data.numWorkgroups[3] = 0;  // unused fourth component
	data.workgroupSize[X] = modes.WorkgroupSizeX;
	data.workgroupSize[Y] = modes.WorkgroupSizeY;
	data.workgroupSize[Z] = modes.WorkgroupSizeZ;
	data.workgroupSize[3] = 0;  // unused fourth component
	data.invocationsPerSubgroup = invocationsPerSubgroup;
	data.invocationsPerWorkgroup = invocationsPerWorkgroup;
	data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
	data.pushConstants = pushConstants;
	data.constants = &sw::Constants::Get();

	marl::WaitGroup wg;
	// Spread the workgroups over at most 'batchCount' scheduler tasks; each
	// task strides through the linear workgroup indices.
	const uint32_t batchCount = 16;

	auto groupCount = groupCountX * groupCountY * groupCountZ;

	for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
	{
		wg.add(1);
		// 'data' is captured by reference; the wg.wait() below keeps it alive
		// until all scheduled tasks have finished.
		marl::schedule([=, &data] {
			defer(wg.done());
			// Each task gets its own shared-memory buffer, reused by every
			// workgroup the task executes.
			std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());

			for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
			{
				// Decompose the linear workgroup index into X/Y/Z offsets:
				// index = z * (countX * countY) + y * countX + x.
				auto modulo = groupIndex;
				auto groupOffsetZ = modulo / (groupCountX * groupCountY);
				modulo -= groupOffsetZ * (groupCountX * groupCountY);
				auto groupOffsetY = modulo / groupCountX;
				modulo -= groupOffsetY * groupCountX;
				auto groupOffsetX = modulo;

				auto groupZ = baseGroupZ + groupOffsetZ;
				auto groupY = baseGroupY + groupOffsetY;
				auto groupX = baseGroupX + groupOffsetX;
				MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);

				using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
				std::queue<Coroutine> coroutines;

				if(modes.ContainsControlBarriers)
				{
					// Make a function call per subgroup so each subgroup
					// can yield, bringing all subgroups to the barrier
					// together.
					for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
					{
						auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
						coroutines.push(std::move(coroutine));
					}
				}
				else
				{
					// No barriers: a single call runs all subgroups to
					// completion sequentially.
					auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
					coroutines.push(std::move(coroutine));
				}

				// Round-robin the coroutines: each await() resumes one until
				// it either yields (at a barrier) and is re-queued, or
				// finishes and is dropped.
				while(coroutines.size() > 0)
				{
					auto coroutine = std::move(coroutines.front());
					coroutines.pop();

					SpirvShader::YieldResult result;
					if(coroutine->await(result))
					{
						// TODO: Consider result (when the enum is more than 1 entry).
						coroutines.push(std::move(coroutine));
					}
				}
			}
		});
	}

	wg.wait();

	// Image writes may require descriptor contents to be re-processed
	// (e.g. by attached observers).
	if(shader->containsImageWrite())
	{
		vk::DescriptorSet::ContentsChanged(descriptorSetObjects, pipelineLayout, device);
	}
}
309 
310 }  // namespace sw
311