1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "SamplerCore.hpp"
16 
17 #include "Constants.hpp"
18 #include "PixelRoutine.hpp"
19 #include "System/Debug.hpp"
20 #include "Vulkan/VkSampler.hpp"
21 
22 namespace sw {
23 
SamplerCore(Pointer<Byte> & constants,const Sampler & state)24 SamplerCore::SamplerCore(Pointer<Byte> &constants, const Sampler &state)
25     : constants(constants)
26     , state(state)
27 {
28 }
sampleTexture(Pointer<Byte> & texture,Float4 uvwa[4],Float4 & dRef,Float && lodOrBias,Float4 & dsx,Float4 & dsy,Vector4i & offset,Int4 & sample,SamplerFunction function)29 Vector4f SamplerCore::sampleTexture(Pointer<Byte> &texture, Float4 uvwa[4], Float4 &dRef, Float &&lodOrBias, Float4 &dsx, Float4 &dsy, Vector4i &offset, Int4 &sample, SamplerFunction function)
30 {
31 	Vector4f c;
32 
33 	Float4 u = uvwa[0];
34 	Float4 v = uvwa[1];
35 	Float4 w = uvwa[2];
36 	Float4 a;  // Array layer coordinate
37 	switch(state.textureType)
38 	{
39 		case VK_IMAGE_VIEW_TYPE_1D_ARRAY: a = uvwa[1]; break;
40 		case VK_IMAGE_VIEW_TYPE_2D_ARRAY: a = uvwa[2]; break;
41 		case VK_IMAGE_VIEW_TYPE_CUBE_ARRAY: a = uvwa[3]; break;
42 		default: break;
43 	}
44 
45 	Float lod;
46 	Float anisotropy;
47 	Float4 uDelta;
48 	Float4 vDelta;
49 	Float4 M;  // Major axis
50 
51 	if(state.isCube())
52 	{
53 		Int4 face = cubeFace(u, v, uvwa[0], uvwa[1], uvwa[2], M);
54 		w = As<Float4>(face);
55 	}
56 
57 	if(function == Implicit || function == Bias || function == Grad || function == Query)
58 	{
59 		if(state.is1D())
60 		{
61 			computeLod1D(texture, lod, u, dsx, dsy, function);
62 		}
63 		else if(state.is2D())
64 		{
65 			computeLod2D(texture, lod, anisotropy, uDelta, vDelta, u, v, dsx, dsy, function);
66 		}
67 		else if(state.isCube())
68 		{
69 			computeLodCube(texture, lod, uvwa[0], uvwa[1], uvwa[2], dsx, dsy, M, function);
70 		}
71 		else
72 		{
73 			computeLod3D(texture, lod, u, v, w, dsx, dsy, function);
74 		}
75 
76 		Float bias = state.mipLodBias;
77 
78 		if(function == Bias)
79 		{
80 			// Add SPIR-V Bias operand to the sampler provided bias and clamp to maxSamplerLodBias limit.
81 			bias = Min(Max(bias + lodOrBias, -vk::MAX_SAMPLER_LOD_BIAS), vk::MAX_SAMPLER_LOD_BIAS);
82 		}
83 
84 		lod += bias;
85 	}
86 	else if(function == Lod)
87 	{
88 		// Vulkan 1.1: "The absolute value of mipLodBias must be less than or equal to VkPhysicalDeviceLimits::maxSamplerLodBias"
89 		// Hence no explicit clamping to maxSamplerLodBias is required in this case.
90 		lod = lodOrBias + state.mipLodBias;
91 	}
92 	else if(function == Fetch)
93 	{
94 		// TODO: Eliminate int-float-int conversion.
95 		lod = Float(As<Int>(lodOrBias));
96 	}
97 	else if(function == Base || function == Gather)
98 	{
99 		lod = Float(0);
100 	}
101 	else
102 		UNREACHABLE("Sampler function %d", int(function));
103 
104 	if(function != Base && function != Fetch && function != Gather)
105 	{
106 		if(function == Query)
107 		{
108 			c.y = Float4(lod);  // Unclamped LOD.
109 		}
110 
111 		lod = Max(lod, state.minLod);
112 		lod = Min(lod, state.maxLod);
113 
114 		if(function == Query)
115 		{
116 			if(state.mipmapFilter == MIPMAP_POINT)
117 			{
118 				lod = Round(lod);  // TODO: Preferred formula is ceil(lod + 0.5) - 1
119 			}
120 
121 			c.x = lod;
122 			//	c.y contains unclamped LOD.
123 
124 			return c;
125 		}
126 	}
127 
128 	bool force32BitFiltering = state.highPrecisionFiltering && !isYcbcrFormat() && (state.textureFilter != FILTER_POINT);
129 	bool use32BitFiltering = hasFloatTexture() || hasUnnormalizedIntegerTexture() || force32BitFiltering ||
130 	                         state.isCube() || state.unnormalizedCoordinates || state.compareEnable ||
131 	                         borderModeActive() || (function == Gather) || (function == Fetch);
132 
133 	if(use32BitFiltering)
134 	{
135 		c = sampleFloatFilter(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, function);
136 
137 		if(!hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable)
138 		{
139 			switch(state.textureFormat)
140 			{
141 				case VK_FORMAT_R5G6B5_UNORM_PACK16:
142 					c.x *= Float4(1.0f / 0xF800);
143 					c.y *= Float4(1.0f / 0xFC00);
144 					c.z *= Float4(1.0f / 0xF800);
145 					break;
146 				case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
147 					c.x *= Float4(1.0f / 0xF000);
148 					c.y *= Float4(1.0f / 0xF000);
149 					c.z *= Float4(1.0f / 0xF000);
150 					c.w *= Float4(1.0f / 0xF000);
151 					break;
152 				case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
153 					c.x *= Float4(1.0f / 0xF800);
154 					c.y *= Float4(1.0f / 0xF800);
155 					c.z *= Float4(1.0f / 0xF800);
156 					c.w *= Float4(1.0f / 0x8000);
157 					break;
158 				case VK_FORMAT_R8_SNORM:
159 				case VK_FORMAT_R8G8_SNORM:
160 				case VK_FORMAT_R8G8B8A8_SNORM:
161 				case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
162 					c.x = Max(c.x * Float4(1.0f / 0x7F00), Float4(-1.0f));
163 					c.y = Max(c.y * Float4(1.0f / 0x7F00), Float4(-1.0f));
164 					c.z = Max(c.z * Float4(1.0f / 0x7F00), Float4(-1.0f));
165 					c.w = Max(c.w * Float4(1.0f / 0x7F00), Float4(-1.0f));
166 					break;
167 				case VK_FORMAT_R8_UNORM:
168 				case VK_FORMAT_R8G8_UNORM:
169 				case VK_FORMAT_R8G8B8A8_UNORM:
170 				case VK_FORMAT_B8G8R8A8_UNORM:
171 				case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
172 				case VK_FORMAT_B8G8R8A8_SRGB:
173 				case VK_FORMAT_R8G8B8A8_SRGB:
174 				case VK_FORMAT_R8_SRGB:
175 				case VK_FORMAT_R8G8_SRGB:
176 					c.x *= Float4(1.0f / 0xFF00u);
177 					c.y *= Float4(1.0f / 0xFF00u);
178 					c.z *= Float4(1.0f / 0xFF00u);
179 					c.w *= Float4(1.0f / 0xFF00u);
180 					break;
181 				case VK_FORMAT_R16_SNORM:
182 				case VK_FORMAT_R16G16_SNORM:
183 				case VK_FORMAT_R16G16B16A16_SNORM:
184 					c.x = Max(c.x * Float4(1.0f / 0x7FFF), Float4(-1.0f));
185 					c.y = Max(c.y * Float4(1.0f / 0x7FFF), Float4(-1.0f));
186 					c.z = Max(c.z * Float4(1.0f / 0x7FFF), Float4(-1.0f));
187 					c.w = Max(c.w * Float4(1.0f / 0x7FFF), Float4(-1.0f));
188 					break;
189 				default:
190 					for(int component = 0; component < textureComponentCount(); component++)
191 					{
192 						c[component] *= Float4(hasUnsignedTextureComponent(component) ? 1.0f / 0xFFFF : 1.0f / 0x7FFF);
193 					}
194 			}
195 		}
196 	}
197 	else  // 16-bit filtering.
198 	{
199 		Vector4s cs = sampleFilter(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, function);
200 
201 		switch(state.textureFormat)
202 		{
203 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
204 				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
205 				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFC00);
206 				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
207 				break;
208 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
209 				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF000);
210 				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF000);
211 				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF000);
212 				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xF000);
213 				break;
214 			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
215 				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xF800);
216 				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xF800);
217 				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xF800);
218 				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0x8000);
219 				break;
220 			case VK_FORMAT_R8_SNORM:
221 			case VK_FORMAT_R8G8_SNORM:
222 			case VK_FORMAT_R8G8B8A8_SNORM:
223 			case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
224 				c.x = Max(Float4(cs.x) * Float4(1.0f / 0x7F00), Float4(-1.0f));
225 				c.y = Max(Float4(cs.y) * Float4(1.0f / 0x7F00), Float4(-1.0f));
226 				c.z = Max(Float4(cs.z) * Float4(1.0f / 0x7F00), Float4(-1.0f));
227 				c.w = Max(Float4(cs.w) * Float4(1.0f / 0x7F00), Float4(-1.0f));
228 				break;
229 			case VK_FORMAT_R8_UNORM:
230 			case VK_FORMAT_R8G8_UNORM:
231 			case VK_FORMAT_R8G8B8A8_UNORM:
232 			case VK_FORMAT_B8G8R8A8_UNORM:
233 			case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
234 			case VK_FORMAT_B8G8R8A8_SRGB:
235 			case VK_FORMAT_R8G8B8A8_SRGB:
236 			case VK_FORMAT_R8_SRGB:
237 			case VK_FORMAT_R8G8_SRGB:
238 				c.x = Float4(As<UShort4>(cs.x)) * Float4(1.0f / 0xFF00u);
239 				c.y = Float4(As<UShort4>(cs.y)) * Float4(1.0f / 0xFF00u);
240 				c.z = Float4(As<UShort4>(cs.z)) * Float4(1.0f / 0xFF00u);
241 				c.w = Float4(As<UShort4>(cs.w)) * Float4(1.0f / 0xFF00u);
242 				break;
243 			case VK_FORMAT_R16_SNORM:
244 			case VK_FORMAT_R16G16_SNORM:
245 			case VK_FORMAT_R16G16B16A16_SNORM:
246 				c.x = Max(Float4(cs.x) * Float4(1.0f / 0x7FFF), Float4(-1.0f));
247 				c.y = Max(Float4(cs.y) * Float4(1.0f / 0x7FFF), Float4(-1.0f));
248 				c.z = Max(Float4(cs.z) * Float4(1.0f / 0x7FFF), Float4(-1.0f));
249 				c.w = Max(Float4(cs.w) * Float4(1.0f / 0x7FFF), Float4(-1.0f));
250 				break;
251 			default:
252 				for(int component = 0; component < textureComponentCount(); component++)
253 				{
254 					if(hasUnsignedTextureComponent(component))
255 					{
256 						convertUnsigned16(c[component], cs[component]);
257 					}
258 					else
259 					{
260 						convertSigned15(c[component], cs[component]);
261 					}
262 				}
263 		}
264 	}
265 
266 	if(state.textureFilter != FILTER_GATHER)
267 	{
268 		if((state.swizzle.r != VK_COMPONENT_SWIZZLE_R) ||
269 		   (state.swizzle.g != VK_COMPONENT_SWIZZLE_G) ||
270 		   (state.swizzle.b != VK_COMPONENT_SWIZZLE_B) ||
271 		   (state.swizzle.a != VK_COMPONENT_SWIZZLE_A))
272 		{
273 			const Vector4f col = c;
274 			bool integer = hasUnnormalizedIntegerTexture();
275 			c.x = applySwizzle(col, state.swizzle.r, integer);
276 			c.y = applySwizzle(col, state.swizzle.g, integer);
277 			c.z = applySwizzle(col, state.swizzle.b, integer);
278 			c.w = applySwizzle(col, state.swizzle.a, integer);
279 		}
280 	}
281 	else  // Gather
282 	{
283 		VkComponentSwizzle swizzle = gatherSwizzle();
284 
285 		// R/G/B/A swizzles affect the component collected from each texel earlier.
286 		// Handle the ZERO and ONE cases here because we don't need to know the format.
287 
288 		if(swizzle == VK_COMPONENT_SWIZZLE_ZERO)
289 		{
290 			c.x = c.y = c.z = c.w = Float4(0);
291 		}
292 		else if(swizzle == VK_COMPONENT_SWIZZLE_ONE)
293 		{
294 			bool integer = hasUnnormalizedIntegerTexture();
295 			c.x = c.y = c.z = c.w = integer ? As<Float4>(Int4(1)) : RValue<Float4>(Float4(1.0f));
296 		}
297 	}
298 
299 	return c;
300 }
301 
applySwizzle(const Vector4f & c,VkComponentSwizzle swizzle,bool integer)302 Float4 SamplerCore::applySwizzle(const Vector4f &c, VkComponentSwizzle swizzle, bool integer)
303 {
304 	switch(swizzle)
305 	{
306 		default: UNSUPPORTED("VkComponentSwizzle %d", (int)swizzle);
307 		case VK_COMPONENT_SWIZZLE_R: return c.x;
308 		case VK_COMPONENT_SWIZZLE_G: return c.y;
309 		case VK_COMPONENT_SWIZZLE_B: return c.z;
310 		case VK_COMPONENT_SWIZZLE_A: return c.w;
311 		case VK_COMPONENT_SWIZZLE_ZERO: return Float4(0.0f, 0.0f, 0.0f, 0.0f);
312 		case VK_COMPONENT_SWIZZLE_ONE:
313 			if(integer)
314 			{
315 				return Float4(As<Float4>(sw::Int4(1, 1, 1, 1)));
316 			}
317 			else
318 			{
319 				return Float4(1.0f, 1.0f, 1.0f, 1.0f);
320 			}
321 			break;
322 	}
323 };
324 
offsetSample(Short4 & uvw,Pointer<Byte> & mipmap,int halfOffset,bool wrap,int count,Float & lod)325 Short4 SamplerCore::offsetSample(Short4 &uvw, Pointer<Byte> &mipmap, int halfOffset, bool wrap, int count, Float &lod)
326 {
327 	Short4 offset = *Pointer<Short4>(mipmap + halfOffset);
328 
329 	if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
330 	{
331 		offset &= Short4(CmpNLE(Float4(lod), Float4(0.0f)));
332 	}
333 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
334 	{
335 		offset &= Short4(CmpLE(Float4(lod), Float4(0.0f)));
336 	}
337 
338 	if(wrap)
339 	{
340 		switch(count)
341 		{
342 			case -1: return uvw - offset;
343 			case 0: return uvw;
344 			case +1: return uvw + offset;
345 			case 2: return uvw + offset + offset;
346 		}
347 	}
348 	else  // Clamp or mirror
349 	{
350 		switch(count)
351 		{
352 			case -1: return SubSat(As<UShort4>(uvw), As<UShort4>(offset));
353 			case 0: return uvw;
354 			case +1: return AddSat(As<UShort4>(uvw), As<UShort4>(offset));
355 			case 2: return AddSat(AddSat(As<UShort4>(uvw), As<UShort4>(offset)), As<UShort4>(offset));
356 		}
357 	}
358 
359 	return uvw;
360 }
361 
sampleFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta,SamplerFunction function)362 Vector4s SamplerCore::sampleFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
363 {
364 	Vector4s c = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, false, function);
365 
366 	if(function == Fetch)
367 	{
368 		return c;
369 	}
370 
371 	if(state.mipmapFilter == MIPMAP_LINEAR)
372 	{
373 		Vector4s cc = sampleAniso(texture, u, v, w, a, offset, sample, lod, anisotropy, uDelta, vDelta, true, function);
374 
375 		lod *= Float(1 << 16);
376 
377 		UShort4 utri = UShort4(Float4(lod));  // FIXME: Optimize
378 		Short4 stri = utri >> 1;              // FIXME: Optimize
379 
380 		if(hasUnsignedTextureComponent(0))
381 			cc.x = MulHigh(As<UShort4>(cc.x), utri);
382 		else
383 			cc.x = MulHigh(cc.x, stri);
384 		if(hasUnsignedTextureComponent(1))
385 			cc.y = MulHigh(As<UShort4>(cc.y), utri);
386 		else
387 			cc.y = MulHigh(cc.y, stri);
388 		if(hasUnsignedTextureComponent(2))
389 			cc.z = MulHigh(As<UShort4>(cc.z), utri);
390 		else
391 			cc.z = MulHigh(cc.z, stri);
392 		if(hasUnsignedTextureComponent(3))
393 			cc.w = MulHigh(As<UShort4>(cc.w), utri);
394 		else
395 			cc.w = MulHigh(cc.w, stri);
396 
397 		utri = ~utri;
398 		stri = Short4(0x7FFF) - stri;
399 
400 		if(hasUnsignedTextureComponent(0))
401 			c.x = MulHigh(As<UShort4>(c.x), utri);
402 		else
403 			c.x = MulHigh(c.x, stri);
404 		if(hasUnsignedTextureComponent(1))
405 			c.y = MulHigh(As<UShort4>(c.y), utri);
406 		else
407 			c.y = MulHigh(c.y, stri);
408 		if(hasUnsignedTextureComponent(2))
409 			c.z = MulHigh(As<UShort4>(c.z), utri);
410 		else
411 			c.z = MulHigh(c.z, stri);
412 		if(hasUnsignedTextureComponent(3))
413 			c.w = MulHigh(As<UShort4>(c.w), utri);
414 		else
415 			c.w = MulHigh(c.w, stri);
416 
417 		c.x += cc.x;
418 		c.y += cc.y;
419 		c.z += cc.z;
420 		c.w += cc.w;
421 
422 		if(!hasUnsignedTextureComponent(0)) c.x += c.x;
423 		if(!hasUnsignedTextureComponent(1)) c.y += c.y;
424 		if(!hasUnsignedTextureComponent(2)) c.z += c.z;
425 		if(!hasUnsignedTextureComponent(3)) c.w += c.w;
426 	}
427 
428 	return c;
429 }
430 
sampleAniso(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta,bool secondLOD,SamplerFunction function)431 Vector4s SamplerCore::sampleAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
432 {
433 	Vector4s c;
434 
435 	if(state.textureFilter != FILTER_ANISOTROPIC)
436 	{
437 		c = sampleQuad(texture, u, v, w, a, offset, sample, lod, secondLOD, function);
438 	}
439 	else
440 	{
441 		Int N = RoundInt(anisotropy);
442 
443 		Vector4s cSum;
444 
445 		cSum.x = Short4(0);
446 		cSum.y = Short4(0);
447 		cSum.z = Short4(0);
448 		cSum.w = Short4(0);
449 
450 		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
451 		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);
452 		UShort4 cw = *Pointer<UShort4>(constants + OFFSET(Constants, cWeight) + 8 * N);
453 		Short4 sw = Short4(cw >> 1);
454 
455 		Float4 du = uDelta;
456 		Float4 dv = vDelta;
457 
458 		Float4 u0 = u + B * du;
459 		Float4 v0 = v + B * dv;
460 
461 		du *= A;
462 		dv *= A;
463 
464 		Int i = 0;
465 
466 		Do
467 		{
468 			c = sampleQuad(texture, u0, v0, w, a, offset, sample, lod, secondLOD, function);
469 
470 			u0 += du;
471 			v0 += dv;
472 
473 			if(hasUnsignedTextureComponent(0))
474 				cSum.x += As<Short4>(MulHigh(As<UShort4>(c.x), cw));
475 			else
476 				cSum.x += MulHigh(c.x, sw);
477 			if(hasUnsignedTextureComponent(1))
478 				cSum.y += As<Short4>(MulHigh(As<UShort4>(c.y), cw));
479 			else
480 				cSum.y += MulHigh(c.y, sw);
481 			if(hasUnsignedTextureComponent(2))
482 				cSum.z += As<Short4>(MulHigh(As<UShort4>(c.z), cw));
483 			else
484 				cSum.z += MulHigh(c.z, sw);
485 			if(hasUnsignedTextureComponent(3))
486 				cSum.w += As<Short4>(MulHigh(As<UShort4>(c.w), cw));
487 			else
488 				cSum.w += MulHigh(c.w, sw);
489 
490 			i++;
491 		}
492 		Until(i >= N);
493 
494 		if(hasUnsignedTextureComponent(0))
495 			c.x = cSum.x;
496 		else
497 			c.x = AddSat(cSum.x, cSum.x);
498 		if(hasUnsignedTextureComponent(1))
499 			c.y = cSum.y;
500 		else
501 			c.y = AddSat(cSum.y, cSum.y);
502 		if(hasUnsignedTextureComponent(2))
503 			c.z = cSum.z;
504 		else
505 			c.z = AddSat(cSum.z, cSum.z);
506 		if(hasUnsignedTextureComponent(3))
507 			c.w = cSum.w;
508 		else
509 			c.w = AddSat(cSum.w, cSum.w);
510 	}
511 
512 	return c;
513 }
514 
sampleQuad(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD,SamplerFunction function)515 Vector4s SamplerCore::sampleQuad(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
516 {
517 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
518 	{
519 		return sampleQuad2D(texture, u, v, w, a, offset, sample, lod, secondLOD, function);
520 	}
521 	else
522 	{
523 		return sample3D(texture, u, v, w, offset, sample, lod, secondLOD, function);
524 	}
525 }
526 
sampleQuad2D(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD,SamplerFunction function)527 Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
528 {
529 	Vector4s c;
530 
531 	int componentCount = textureComponentCount();
532 	bool gather = (state.textureFilter == FILTER_GATHER);
533 
534 	Pointer<Byte> mipmap;
535 	Pointer<Byte> buffer;
536 	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
537 
538 	Short4 uuuu = address(u, state.addressingModeU, mipmap);
539 	Short4 vvvv = address(v, state.addressingModeV, mipmap);
540 	Short4 wwww = address(w, state.addressingModeW, mipmap);
541 	Short4 layerIndex = computeLayerIndex(a, mipmap);
542 
543 	if(state.textureFilter == FILTER_POINT)
544 	{
545 		c = sampleTexel(uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap, buffer, function);
546 	}
547 	else
548 	{
549 		Short4 uuuu0 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, -1, lod);
550 		Short4 vvvv0 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, -1, lod);
551 		Short4 uuuu1 = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, +1, lod);
552 		Short4 vvvv1 = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, +1, lod);
553 
554 		Vector4s c00 = sampleTexel(uuuu0, vvvv0, wwww, layerIndex, offset, sample, mipmap, buffer, function);
555 		Vector4s c10 = sampleTexel(uuuu1, vvvv0, wwww, layerIndex, offset, sample, mipmap, buffer, function);
556 		Vector4s c01 = sampleTexel(uuuu0, vvvv1, wwww, layerIndex, offset, sample, mipmap, buffer, function);
557 		Vector4s c11 = sampleTexel(uuuu1, vvvv1, wwww, layerIndex, offset, sample, mipmap, buffer, function);
558 
559 		if(!gather)  // Blend
560 		{
561 			// Fractions
562 			UShort4 f0u = As<UShort4>(uuuu0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, width)));
563 			UShort4 f0v = As<UShort4>(vvvv0) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, height)));
564 
565 			UShort4 f1u = ~f0u;
566 			UShort4 f1v = ~f0v;
567 
568 			UShort4 f0u0v = MulHigh(f0u, f0v);
569 			UShort4 f1u0v = MulHigh(f1u, f0v);
570 			UShort4 f0u1v = MulHigh(f0u, f1v);
571 			UShort4 f1u1v = MulHigh(f1u, f1v);
572 
573 			// Signed fractions
574 			Short4 f1u1vs;
575 			Short4 f0u1vs;
576 			Short4 f1u0vs;
577 			Short4 f0u0vs;
578 
579 			if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
580 			{
581 				f1u1vs = f1u1v >> 1;
582 				f0u1vs = f0u1v >> 1;
583 				f1u0vs = f1u0v >> 1;
584 				f0u0vs = f0u0v >> 1;
585 			}
586 
587 			// Bilinear interpolation
588 			if(componentCount >= 1)
589 			{
590 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
591 				{
592 					c00.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0u) + MulHigh(As<UShort4>(c10.x), f0u);
593 					c01.x = As<UShort4>(c01.x) - MulHigh(As<UShort4>(c01.x), f0u) + MulHigh(As<UShort4>(c11.x), f0u);
594 					c.x = As<UShort4>(c00.x) - MulHigh(As<UShort4>(c00.x), f0v) + MulHigh(As<UShort4>(c01.x), f0v);
595 				}
596 				else
597 				{
598 					if(hasUnsignedTextureComponent(0))
599 					{
600 						c00.x = MulHigh(As<UShort4>(c00.x), f1u1v);
601 						c10.x = MulHigh(As<UShort4>(c10.x), f0u1v);
602 						c01.x = MulHigh(As<UShort4>(c01.x), f1u0v);
603 						c11.x = MulHigh(As<UShort4>(c11.x), f0u0v);
604 					}
605 					else
606 					{
607 						c00.x = MulHigh(c00.x, f1u1vs);
608 						c10.x = MulHigh(c10.x, f0u1vs);
609 						c01.x = MulHigh(c01.x, f1u0vs);
610 						c11.x = MulHigh(c11.x, f0u0vs);
611 					}
612 
613 					c.x = (c00.x + c10.x) + (c01.x + c11.x);
614 					if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);  // Correct for signed fractions
615 				}
616 			}
617 
618 			if(componentCount >= 2)
619 			{
620 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(1))
621 				{
622 					c00.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0u) + MulHigh(As<UShort4>(c10.y), f0u);
623 					c01.y = As<UShort4>(c01.y) - MulHigh(As<UShort4>(c01.y), f0u) + MulHigh(As<UShort4>(c11.y), f0u);
624 					c.y = As<UShort4>(c00.y) - MulHigh(As<UShort4>(c00.y), f0v) + MulHigh(As<UShort4>(c01.y), f0v);
625 				}
626 				else
627 				{
628 					if(hasUnsignedTextureComponent(1))
629 					{
630 						c00.y = MulHigh(As<UShort4>(c00.y), f1u1v);
631 						c10.y = MulHigh(As<UShort4>(c10.y), f0u1v);
632 						c01.y = MulHigh(As<UShort4>(c01.y), f1u0v);
633 						c11.y = MulHigh(As<UShort4>(c11.y), f0u0v);
634 					}
635 					else
636 					{
637 						c00.y = MulHigh(c00.y, f1u1vs);
638 						c10.y = MulHigh(c10.y, f0u1vs);
639 						c01.y = MulHigh(c01.y, f1u0vs);
640 						c11.y = MulHigh(c11.y, f0u0vs);
641 					}
642 
643 					c.y = (c00.y + c10.y) + (c01.y + c11.y);
644 					if(!hasUnsignedTextureComponent(1)) c.y = AddSat(c.y, c.y);  // Correct for signed fractions
645 				}
646 			}
647 
648 			if(componentCount >= 3)
649 			{
650 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(2))
651 				{
652 					c00.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0u) + MulHigh(As<UShort4>(c10.z), f0u);
653 					c01.z = As<UShort4>(c01.z) - MulHigh(As<UShort4>(c01.z), f0u) + MulHigh(As<UShort4>(c11.z), f0u);
654 					c.z = As<UShort4>(c00.z) - MulHigh(As<UShort4>(c00.z), f0v) + MulHigh(As<UShort4>(c01.z), f0v);
655 				}
656 				else
657 				{
658 					if(hasUnsignedTextureComponent(2))
659 					{
660 						c00.z = MulHigh(As<UShort4>(c00.z), f1u1v);
661 						c10.z = MulHigh(As<UShort4>(c10.z), f0u1v);
662 						c01.z = MulHigh(As<UShort4>(c01.z), f1u0v);
663 						c11.z = MulHigh(As<UShort4>(c11.z), f0u0v);
664 					}
665 					else
666 					{
667 						c00.z = MulHigh(c00.z, f1u1vs);
668 						c10.z = MulHigh(c10.z, f0u1vs);
669 						c01.z = MulHigh(c01.z, f1u0vs);
670 						c11.z = MulHigh(c11.z, f0u0vs);
671 					}
672 
673 					c.z = (c00.z + c10.z) + (c01.z + c11.z);
674 					if(!hasUnsignedTextureComponent(2)) c.z = AddSat(c.z, c.z);  // Correct for signed fractions
675 				}
676 			}
677 
678 			if(componentCount >= 4)
679 			{
680 				if(has16bitTextureComponents() && hasUnsignedTextureComponent(3))
681 				{
682 					c00.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0u) + MulHigh(As<UShort4>(c10.w), f0u);
683 					c01.w = As<UShort4>(c01.w) - MulHigh(As<UShort4>(c01.w), f0u) + MulHigh(As<UShort4>(c11.w), f0u);
684 					c.w = As<UShort4>(c00.w) - MulHigh(As<UShort4>(c00.w), f0v) + MulHigh(As<UShort4>(c01.w), f0v);
685 				}
686 				else
687 				{
688 					if(hasUnsignedTextureComponent(3))
689 					{
690 						c00.w = MulHigh(As<UShort4>(c00.w), f1u1v);
691 						c10.w = MulHigh(As<UShort4>(c10.w), f0u1v);
692 						c01.w = MulHigh(As<UShort4>(c01.w), f1u0v);
693 						c11.w = MulHigh(As<UShort4>(c11.w), f0u0v);
694 					}
695 					else
696 					{
697 						c00.w = MulHigh(c00.w, f1u1vs);
698 						c10.w = MulHigh(c10.w, f0u1vs);
699 						c01.w = MulHigh(c01.w, f1u0vs);
700 						c11.w = MulHigh(c11.w, f0u0vs);
701 					}
702 
703 					c.w = (c00.w + c10.w) + (c01.w + c11.w);
704 					if(!hasUnsignedTextureComponent(3)) c.w = AddSat(c.w, c.w);  // Correct for signed fractions
705 				}
706 			}
707 		}
708 		else  // Gather
709 		{
710 			VkComponentSwizzle swizzle = gatherSwizzle();
711 			switch(swizzle)
712 			{
713 				case VK_COMPONENT_SWIZZLE_ZERO:
714 				case VK_COMPONENT_SWIZZLE_ONE:
715 					// Handled at the final component swizzle.
716 					break;
717 				default:
718 					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
719 					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
720 					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
721 					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
722 					break;
723 			}
724 		}
725 	}
726 
727 	return c;
728 }
729 
sample3D(Pointer<Byte> & texture,Float4 & u_,Float4 & v_,Float4 & w_,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD,SamplerFunction function)730 Vector4s SamplerCore::sample3D(Pointer<Byte> &texture, Float4 &u_, Float4 &v_, Float4 &w_, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
731 {
732 	Vector4s c_;
733 
734 	int componentCount = textureComponentCount();
735 
736 	Pointer<Byte> mipmap;
737 	Pointer<Byte> buffer;
738 	selectMipmap(texture, mipmap, buffer, lod, secondLOD);
739 
740 	Short4 uuuu = address(u_, state.addressingModeU, mipmap);
741 	Short4 vvvv = address(v_, state.addressingModeV, mipmap);
742 	Short4 wwww = address(w_, state.addressingModeW, mipmap);
743 
744 	if(state.textureFilter == FILTER_POINT)
745 	{
746 		c_ = sampleTexel(uuuu, vvvv, wwww, 0, offset, sample, mipmap, buffer, function);
747 	}
748 	else
749 	{
750 		Vector4s c[2][2][2];
751 
752 		Short4 u[2][2][2];
753 		Short4 v[2][2][2];
754 		Short4 s[2][2][2];
755 
756 		for(int i = 0; i < 2; i++)
757 		{
758 			for(int j = 0; j < 2; j++)
759 			{
760 				for(int k = 0; k < 2; k++)
761 				{
762 					u[i][j][k] = offsetSample(uuuu, mipmap, OFFSET(Mipmap, uHalf), state.addressingModeU == ADDRESSING_WRAP, i * 2 - 1, lod);
763 					v[i][j][k] = offsetSample(vvvv, mipmap, OFFSET(Mipmap, vHalf), state.addressingModeV == ADDRESSING_WRAP, j * 2 - 1, lod);
764 					s[i][j][k] = offsetSample(wwww, mipmap, OFFSET(Mipmap, wHalf), state.addressingModeW == ADDRESSING_WRAP, k * 2 - 1, lod);
765 				}
766 			}
767 		}
768 
769 		// Fractions
770 		UShort4 f0u = As<UShort4>(u[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, width)));
771 		UShort4 f0v = As<UShort4>(v[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, height)));
772 		UShort4 f0s = As<UShort4>(s[0][0][0]) * UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth)));
773 
774 		UShort4 f1u = ~f0u;
775 		UShort4 f1v = ~f0v;
776 		UShort4 f1s = ~f0s;
777 
778 		UShort4 f[2][2][2];
779 		Short4 fs[2][2][2];
780 
781 		f[1][1][1] = MulHigh(f1u, f1v);
782 		f[0][1][1] = MulHigh(f0u, f1v);
783 		f[1][0][1] = MulHigh(f1u, f0v);
784 		f[0][0][1] = MulHigh(f0u, f0v);
785 		f[1][1][0] = MulHigh(f1u, f1v);
786 		f[0][1][0] = MulHigh(f0u, f1v);
787 		f[1][0][0] = MulHigh(f1u, f0v);
788 		f[0][0][0] = MulHigh(f0u, f0v);
789 
790 		f[1][1][1] = MulHigh(f[1][1][1], f1s);
791 		f[0][1][1] = MulHigh(f[0][1][1], f1s);
792 		f[1][0][1] = MulHigh(f[1][0][1], f1s);
793 		f[0][0][1] = MulHigh(f[0][0][1], f1s);
794 		f[1][1][0] = MulHigh(f[1][1][0], f0s);
795 		f[0][1][0] = MulHigh(f[0][1][0], f0s);
796 		f[1][0][0] = MulHigh(f[1][0][0], f0s);
797 		f[0][0][0] = MulHigh(f[0][0][0], f0s);
798 
799 		// Signed fractions
800 		if(!hasUnsignedTextureComponent(0) || !hasUnsignedTextureComponent(1) || !hasUnsignedTextureComponent(2) || !hasUnsignedTextureComponent(3))
801 		{
802 			fs[0][0][0] = f[0][0][0] >> 1;
803 			fs[0][0][1] = f[0][0][1] >> 1;
804 			fs[0][1][0] = f[0][1][0] >> 1;
805 			fs[0][1][1] = f[0][1][1] >> 1;
806 			fs[1][0][0] = f[1][0][0] >> 1;
807 			fs[1][0][1] = f[1][0][1] >> 1;
808 			fs[1][1][0] = f[1][1][0] >> 1;
809 			fs[1][1][1] = f[1][1][1] >> 1;
810 		}
811 
812 		for(int i = 0; i < 2; i++)
813 		{
814 			for(int j = 0; j < 2; j++)
815 			{
816 				for(int k = 0; k < 2; k++)
817 				{
818 					c[i][j][k] = sampleTexel(u[i][j][k], v[i][j][k], s[i][j][k], 0, offset, sample, mipmap, buffer, function);
819 
820 					if(componentCount >= 1)
821 					{
822 						if(hasUnsignedTextureComponent(0))
823 							c[i][j][k].x = MulHigh(As<UShort4>(c[i][j][k].x), f[1 - i][1 - j][1 - k]);
824 						else
825 							c[i][j][k].x = MulHigh(c[i][j][k].x, fs[1 - i][1 - j][1 - k]);
826 					}
827 					if(componentCount >= 2)
828 					{
829 						if(hasUnsignedTextureComponent(1))
830 							c[i][j][k].y = MulHigh(As<UShort4>(c[i][j][k].y), f[1 - i][1 - j][1 - k]);
831 						else
832 							c[i][j][k].y = MulHigh(c[i][j][k].y, fs[1 - i][1 - j][1 - k]);
833 					}
834 					if(componentCount >= 3)
835 					{
836 						if(hasUnsignedTextureComponent(2))
837 							c[i][j][k].z = MulHigh(As<UShort4>(c[i][j][k].z), f[1 - i][1 - j][1 - k]);
838 						else
839 							c[i][j][k].z = MulHigh(c[i][j][k].z, fs[1 - i][1 - j][1 - k]);
840 					}
841 					if(componentCount >= 4)
842 					{
843 						if(hasUnsignedTextureComponent(3))
844 							c[i][j][k].w = MulHigh(As<UShort4>(c[i][j][k].w), f[1 - i][1 - j][1 - k]);
845 						else
846 							c[i][j][k].w = MulHigh(c[i][j][k].w, fs[1 - i][1 - j][1 - k]);
847 					}
848 
849 					if(i != 0 || j != 0 || k != 0)
850 					{
851 						if(componentCount >= 1) c[0][0][0].x += c[i][j][k].x;
852 						if(componentCount >= 2) c[0][0][0].y += c[i][j][k].y;
853 						if(componentCount >= 3) c[0][0][0].z += c[i][j][k].z;
854 						if(componentCount >= 4) c[0][0][0].w += c[i][j][k].w;
855 					}
856 				}
857 			}
858 		}
859 
860 		if(componentCount >= 1) c_.x = c[0][0][0].x;
861 		if(componentCount >= 2) c_.y = c[0][0][0].y;
862 		if(componentCount >= 3) c_.z = c[0][0][0].z;
863 		if(componentCount >= 4) c_.w = c[0][0][0].w;
864 
865 		// Correct for signed fractions
866 		if(componentCount >= 1)
867 			if(!hasUnsignedTextureComponent(0)) c_.x = AddSat(c_.x, c_.x);
868 		if(componentCount >= 2)
869 			if(!hasUnsignedTextureComponent(1)) c_.y = AddSat(c_.y, c_.y);
870 		if(componentCount >= 3)
871 			if(!hasUnsignedTextureComponent(2)) c_.z = AddSat(c_.z, c_.z);
872 		if(componentCount >= 4)
873 			if(!hasUnsignedTextureComponent(3)) c_.w = AddSat(c_.w, c_.w);
874 	}
875 
876 	return c_;
877 }
878 
sampleFloatFilter(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,Float & anisotropy,Float4 & uDelta,Float4 & vDelta,SamplerFunction function)879 Vector4f SamplerCore::sampleFloatFilter(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, SamplerFunction function)
880 {
881 	Vector4f c = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, false, function);
882 
883 	if(function == Fetch)
884 	{
885 		return c;
886 	}
887 
888 	if(state.mipmapFilter == MIPMAP_LINEAR)
889 	{
890 		Vector4f cc = sampleFloatAniso(texture, u, v, w, a, dRef, offset, sample, lod, anisotropy, uDelta, vDelta, true, function);
891 
892 		Float4 lod4 = Float4(Frac(lod));
893 
894 		c.x = (cc.x - c.x) * lod4 + c.x;
895 		c.y = (cc.y - c.y) * lod4 + c.y;
896 		c.z = (cc.z - c.z) * lod4 + c.z;
897 		c.w = (cc.w - c.w) * lod4 + c.w;
898 	}
899 
900 	return c;
901 }
902 
Vector4f SamplerCore::sampleFloatAniso(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, bool secondLOD, SamplerFunction function)
{
	Vector4f c;

	if(state.textureFilter != FILTER_ANISOTROPIC)
	{
		c = sampleFloat(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD, function);
	}
	else
	{
		// Anisotropic filtering: take N samples along the anisotropy axis
		// (uDelta, vDelta) and accumulate them with per-tap weights.
		Int N = RoundInt(anisotropy);

		Vector4f cSum;

		cSum.x = Float4(0.0f);
		cSum.y = Float4(0.0f);
		cSum.z = Float4(0.0f);
		cSum.w = Float4(0.0f);

		// Per-tap weight (A) and starting offset (B) come from precomputed
		// constant tables indexed by the rounded sample count N.
		// NOTE(review): presumably the N weights sum to 1 so the taps
		// average — verify against Constants::uvWeight initialization.
		Float4 A = *Pointer<Float4>(constants + OFFSET(Constants, uvWeight) + 16 * N);
		Float4 B = *Pointer<Float4>(constants + OFFSET(Constants, uvStart) + 16 * N);

		Float4 du = uDelta;
		Float4 dv = vDelta;

		// First tap position, offset back along the axis so the N taps
		// straddle the original coordinate.
		Float4 u0 = u + B * du;
		Float4 v0 = v + B * dv;

		// Step between consecutive taps.
		du *= A;
		dv *= A;

		Int i = 0;

		Do
		{
			c = sampleFloat(texture, u0, v0, w, a, dRef, offset, sample, lod, secondLOD, function);

			u0 += du;
			v0 += dv;

			// Weighted accumulation of this tap.
			cSum.x += c.x * A;
			cSum.y += c.y * A;
			cSum.z += c.z * A;
			cSum.w += c.w * A;

			i++;
		}
		Until(i >= N);

		c.x = cSum.x;
		c.y = cSum.y;
		c.z = cSum.z;
		c.w = cSum.w;
	}

	return c;
}
960 
sampleFloat(Pointer<Byte> & texture,Float4 & u,Float4 & v,Float4 & w,const Float4 & a,Float4 & dRef,Vector4i & offset,const Int4 & sample,Float & lod,bool secondLOD,SamplerFunction function)961 Vector4f SamplerCore::sampleFloat(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
962 {
963 	if(state.textureType != VK_IMAGE_VIEW_TYPE_3D)
964 	{
965 		return sampleFloat2D(texture, u, v, w, a, dRef, offset, sample, lod, secondLOD, function);
966 	}
967 	else
968 	{
969 		return sampleFloat3D(texture, u, v, w, dRef, offset, sample, lod, secondLOD, function);
970 	}
971 }
972 
Vector4f SamplerCore::sampleFloat2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, const Float4 &a, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
{
	Vector4f c;

	int componentCount = textureComponentCount();
	bool gather = (state.textureFilter == FILTER_GATHER);

	// Select the mip level for this LOD (the second bracketing level when
	// secondLOD is set) and its backing texel buffer.
	Pointer<Byte> mipmap;
	Pointer<Byte> buffer;
	selectMipmap(texture, mipmap, buffer, lod, secondLOD);

	// Convert u/v into the two bracketing integer texel coordinates (x0/x1,
	// y0/y1) plus fractional weights (fu, fv) for linear filtering.
	Int4 x0, x1, y0, y1;
	Float4 fu, fv;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);

	// Pre-multiply row coordinates by the pitch so x and y can simply be
	// summed into a linear texel index by sampleTexel.
	Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
	y0 *= pitchP;

	Int4 z;
	if(state.isCube() || state.isArrayed())
	{
		Int4 face = As<Int4>(w);  // Cube face index, stored in w by the caller.
		Int4 layerIndex = computeLayerIndex(a, mipmap, function);

		// For cube maps, the layer argument is per cube, each of which has 6 layers
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layerIndex *= Int4(6);
		}

		z = state.isCube() ? face : layerIndex;

		// Cube arrays address slice (6 * layer + face).
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			z += layerIndex;
		}

		z *= *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
	}

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		// Nearest filtering / texel fetch: a single texel read suffices.
		c = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer, function);
	}
	else
	{
		y1 *= pitchP;

		// Read the 2x2 texel footprint.
		Vector4f c00 = sampleTexel(x0, y0, z, dRef, sample, mipmap, buffer, function);
		Vector4f c10 = sampleTexel(x1, y0, z, dRef, sample, mipmap, buffer, function);
		Vector4f c01 = sampleTexel(x0, y1, z, dRef, sample, mipmap, buffer, function);
		Vector4f c11 = sampleTexel(x1, y1, z, dRef, sample, mipmap, buffer, function);

		if(!gather)  // Blend
		{
			// Bilinear interpolation: blend horizontally along u, then
			// vertically along v.
			if(componentCount >= 1) c00.x = c00.x + fu * (c10.x - c00.x);
			if(componentCount >= 2) c00.y = c00.y + fu * (c10.y - c00.y);
			if(componentCount >= 3) c00.z = c00.z + fu * (c10.z - c00.z);
			if(componentCount >= 4) c00.w = c00.w + fu * (c10.w - c00.w);

			if(componentCount >= 1) c01.x = c01.x + fu * (c11.x - c01.x);
			if(componentCount >= 2) c01.y = c01.y + fu * (c11.y - c01.y);
			if(componentCount >= 3) c01.z = c01.z + fu * (c11.z - c01.z);
			if(componentCount >= 4) c01.w = c01.w + fu * (c11.w - c01.w);

			if(componentCount >= 1) c.x = c00.x + fv * (c01.x - c00.x);
			if(componentCount >= 2) c.y = c00.y + fv * (c01.y - c00.y);
			if(componentCount >= 3) c.z = c00.z + fv * (c01.z - c00.z);
			if(componentCount >= 4) c.w = c00.w + fv * (c01.w - c00.w);
		}
		else  // Gather
		{
			// Gather returns one selected component of each of the four
			// texels, in the (x, y, z, w) = (c01, c11, c10, c00) order.
			VkComponentSwizzle swizzle = gatherSwizzle();
			switch(swizzle)
			{
				case VK_COMPONENT_SWIZZLE_ZERO:
				case VK_COMPONENT_SWIZZLE_ONE:
					// Handled at the final component swizzle.
					break;
				default:
					c.x = c01[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.y = c11[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.z = c10[swizzle - VK_COMPONENT_SWIZZLE_R];
					c.w = c00[swizzle - VK_COMPONENT_SWIZZLE_R];
					break;
			}
		}
	}

	return c;
}
1066 
Vector4f SamplerCore::sampleFloat3D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Float4 &dRef, Vector4i &offset, const Int4 &sample, Float &lod, bool secondLOD, SamplerFunction function)
{
	Vector4f c;

	int componentCount = textureComponentCount();

	// Select the mip level for this LOD and its backing texel buffer.
	Pointer<Byte> mipmap;
	Pointer<Byte> buffer;
	selectMipmap(texture, mipmap, buffer, lod, secondLOD);

	// Bracketing integer texel coordinates per axis, plus fractional weights
	// (fu, fv, fw) for trilinear filtering.
	Int4 x0, x1, y0, y1, z0, z1;
	Float4 fu, fv, fw;
	Int4 filter = computeFilterOffset(lod);
	address(u, x0, x1, fu, mipmap, offset.x, filter, OFFSET(Mipmap, width), state.addressingModeU, function);
	address(v, y0, y1, fv, mipmap, offset.y, filter, OFFSET(Mipmap, height), state.addressingModeV, function);
	address(w, z0, z1, fw, mipmap, offset.z, filter, OFFSET(Mipmap, depth), state.addressingModeW, function);

	// Pre-multiply row/slice coordinates so x + y + z forms a linear index.
	Int4 pitchP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, pitchP), 16);
	Int4 sliceP = *Pointer<Int4>(mipmap + OFFSET(Mipmap, sliceP), 16);
	y0 *= pitchP;
	z0 *= sliceP;

	if(state.textureFilter == FILTER_POINT || (function == Fetch))
	{
		// Nearest filtering / texel fetch: a single texel read suffices.
		c = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer, function);
	}
	else
	{
		y1 *= pitchP;
		z1 *= sliceP;

		// Read the 2x2x2 texel footprint.
		Vector4f c000 = sampleTexel(x0, y0, z0, dRef, sample, mipmap, buffer, function);
		Vector4f c100 = sampleTexel(x1, y0, z0, dRef, sample, mipmap, buffer, function);
		Vector4f c010 = sampleTexel(x0, y1, z0, dRef, sample, mipmap, buffer, function);
		Vector4f c110 = sampleTexel(x1, y1, z0, dRef, sample, mipmap, buffer, function);
		Vector4f c001 = sampleTexel(x0, y0, z1, dRef, sample, mipmap, buffer, function);
		Vector4f c101 = sampleTexel(x1, y0, z1, dRef, sample, mipmap, buffer, function);
		Vector4f c011 = sampleTexel(x0, y1, z1, dRef, sample, mipmap, buffer, function);
		Vector4f c111 = sampleTexel(x1, y1, z1, dRef, sample, mipmap, buffer, function);

		// Blend first slice
		if(componentCount >= 1) c000.x = c000.x + fu * (c100.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fu * (c100.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fu * (c100.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fu * (c100.w - c000.w);

		if(componentCount >= 1) c010.x = c010.x + fu * (c110.x - c010.x);
		if(componentCount >= 2) c010.y = c010.y + fu * (c110.y - c010.y);
		if(componentCount >= 3) c010.z = c010.z + fu * (c110.z - c010.z);
		if(componentCount >= 4) c010.w = c010.w + fu * (c110.w - c010.w);

		if(componentCount >= 1) c000.x = c000.x + fv * (c010.x - c000.x);
		if(componentCount >= 2) c000.y = c000.y + fv * (c010.y - c000.y);
		if(componentCount >= 3) c000.z = c000.z + fv * (c010.z - c000.z);
		if(componentCount >= 4) c000.w = c000.w + fv * (c010.w - c000.w);

		// Blend second slice
		if(componentCount >= 1) c001.x = c001.x + fu * (c101.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fu * (c101.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fu * (c101.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fu * (c101.w - c001.w);

		if(componentCount >= 1) c011.x = c011.x + fu * (c111.x - c011.x);
		if(componentCount >= 2) c011.y = c011.y + fu * (c111.y - c011.y);
		if(componentCount >= 3) c011.z = c011.z + fu * (c111.z - c011.z);
		if(componentCount >= 4) c011.w = c011.w + fu * (c111.w - c011.w);

		if(componentCount >= 1) c001.x = c001.x + fv * (c011.x - c001.x);
		if(componentCount >= 2) c001.y = c001.y + fv * (c011.y - c001.y);
		if(componentCount >= 3) c001.z = c001.z + fv * (c011.z - c001.z);
		if(componentCount >= 4) c001.w = c001.w + fv * (c011.w - c001.w);

		// Blend slices
		if(componentCount >= 1) c.x = c000.x + fw * (c001.x - c000.x);
		if(componentCount >= 2) c.y = c000.y + fw * (c001.y - c000.y);
		if(componentCount >= 3) c.z = c000.z + fw * (c001.z - c000.z);
		if(componentCount >= 4) c.w = c000.w + fw * (c001.w - c000.w);
	}

	return c;
}
1148 
// Fast approximation of log2(sqrt(x)), operating on the IEEE-754 bit pattern
// of the squared input (exact for powers of two; approximate in between).
static Float log2sqrt(Float lod)
{
	// log2(sqrt(lod))                              // Equals 0.25 * log2(lod^2).
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33000000));              // Scale by 0.25 * 2^-23 (mantissa length).

	return lod;
}
1158 
// Fast approximation of log2(x), same bit-pattern technique as log2sqrt above.
static Float log2(Float lod)
{
	// log2(lod)                                    // Equals 0.5 * log2(lod^2).
	lod *= lod;                                     // Squaring doubles the exponent and produces an extra bit of precision.
	lod = Float(As<Int>(lod)) - Float(0x3F800000);  // Interpret as integer and subtract the exponent bias.
	lod *= As<Float>(Int(0x33800000));              // Scale by 0.5 * 2^-23 (mantissa length).

	return lod;
}
1167 
void SamplerCore::computeLod1D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &dsx, Float4 &dsy, SamplerFunction function)
{
	Float4 dudxy;

	if(function != Grad)  // Implicit
	{
		// Derivatives from the quad's u coordinates: lanes y and z minus
		// lane x approximate du/dx and du/dy (assumes the standard 2x2 quad
		// lane layout — see PixelRoutine).
		dudxy = uuuu.yz - uuuu.xx;
	}
	else
	{
		// Explicit gradients: interleave (du/dx, du/dy) from dsx and dsy.
		dudxy = UnpackLow(dsx, dsy);
	}

	// Scale by texture dimensions.
	Float4 dUdxy = dudxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	// Note we could take the absolute value here and omit the square root below,
	// but this is more consistent with the 2D calculation and still cheap.
	Float4 dU2dxy = dUdxy * dUdxy;

	// LOD = log2 of the largest scaled derivative magnitude.
	lod = Max(Float(dU2dxy.x), Float(dU2dxy.y));
	lod = log2sqrt(lod);
}
1191 
void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisotropy, Float4 &uDelta, Float4 &vDelta, Float4 &uuuu, Float4 &vvvv, Float4 &dsx, Float4 &dsy, SamplerFunction function)
{
	// Lanes of duvdxy hold (du/dx, du/dy, dv/dx, dv/dy).
	Float4 duvdxy;

	if(function != Grad)  // Implicit
	{
		// Derivatives from the quad's coordinate deltas (assumes the standard
		// 2x2 quad lane layout — see PixelRoutine).
		duvdxy = Float4(uuuu.yz, vvvv.yz) - Float4(uuuu.xx, vvvv.xx);
	}
	else
	{
		Float4 dudxy = Float4(dsx.xx, dsy.xx);
		Float4 dvdxy = Float4(dsx.yy, dsy.yy);

		duvdxy = Float4(dudxy.xz, dvdxy.xz);
	}

	// Scale by texture dimensions.
	Float4 dUVdxy = duvdxy * *Pointer<Float4>(texture + OFFSET(Texture, widthWidthHeightHeight));

	// Squared lengths of the x- and y-direction gradient vectors.
	Float4 dUV2dxy = dUVdxy * dUVdxy;
	Float4 dUV2 = dUV2dxy.xy + dUV2dxy.zw;

	lod = Max(Float(dUV2.x), Float(dUV2.y));  // Square length of major axis

	if(state.textureFilter == FILTER_ANISOTROPIC)
	{
		// Area of the parallelogram spanned by the two gradient vectors
		// (magnitude of their 2D cross product).
		Float det = Abs(Float(dUVdxy.x) * Float(dUVdxy.w) - Float(dUVdxy.y) * Float(dUVdxy.z));

		Float4 dudx = duvdxy.xxxx;
		Float4 dudy = duvdxy.yyyy;
		Float4 dvdx = duvdxy.zzzz;
		Float4 dvdy = duvdxy.wwww;

		// Select the longer (major) gradient vector as the anisotropy axis.
		Int4 mask = As<Int4>(CmpNLT(dUV2.x, dUV2.y));
		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));

		// anisotropy ~= major / minor axis length (major^2 / area), clamped
		// to the sampler's maximum.
		anisotropy = lod * Rcp(det, Precision::Relaxed);
		anisotropy = Min(anisotropy, state.maxAnisotropy);

		// With N taps along the major axis, the per-tap footprint shrinks;
		// reduce the (squared) LOD accordingly.
		lod *= Rcp(anisotropy * anisotropy, Precision::Relaxed);
	}

	lod = log2sqrt(lod);  // log2(sqrt(lod))
}
1237 
void SamplerCore::computeLodCube(Pointer<Byte> &texture, Float &lod, Float4 &u, Float4 &v, Float4 &w, Float4 &dsx, Float4 &dsy, Float4 &M, SamplerFunction function)
{
	Float4 dudxy, dvdxy, dsdxy;

	if(function != Grad)  // Implicit
	{
		// Scale the direction coordinates by M (the per-lane major-axis
		// scale produced by cubeFace) before differencing across the quad.
		Float4 U = u * M;
		Float4 V = v * M;
		Float4 W = w * M;

		dudxy = Abs(U - U.xxxx);
		dvdxy = Abs(V - V.xxxx);
		dsdxy = Abs(W - W.xxxx);
	}
	else
	{
		// Explicit gradients, scaled by lane x's major-axis scale.
		dudxy = Float4(dsx.xx, dsy.xx);
		dvdxy = Float4(dsx.yy, dsy.yy);
		dsdxy = Float4(dsx.zz, dsy.zz);

		dudxy = Abs(dudxy * Float4(M.x));
		dvdxy = Abs(dvdxy * Float4(M.x));
		dsdxy = Abs(dsdxy * Float4(M.x));
	}

	// Compute the largest Manhattan distance in two dimensions.
	// This takes the footprint across adjacent faces into account.
	Float4 duvdxy = dudxy + dvdxy;
	Float4 dusdxy = dudxy + dsdxy;
	Float4 dvsdxy = dvdxy + dsdxy;

	dudxy = Max(Max(duvdxy, dusdxy), dvsdxy);

	// Lanes y and z hold the x- and y-direction footprints.
	lod = Max(Float(dudxy.y), Float(dudxy.z));  // FIXME: Max(dudxy.y, dudxy.z);

	// Scale by texture dimension.
	lod *= *Pointer<Float>(texture + OFFSET(Texture, width));

	lod = log2(lod);
}
1278 
computeLod3D(Pointer<Byte> & texture,Float & lod,Float4 & uuuu,Float4 & vvvv,Float4 & wwww,Float4 & dsx,Float4 & dsy,SamplerFunction function)1279 void SamplerCore::computeLod3D(Pointer<Byte> &texture, Float &lod, Float4 &uuuu, Float4 &vvvv, Float4 &wwww, Float4 &dsx, Float4 &dsy, SamplerFunction function)
1280 {
1281 	Float4 dudxy, dvdxy, dsdxy;
1282 
1283 	if(function != Grad)  // Implicit
1284 	{
1285 		dudxy = uuuu - uuuu.xxxx;
1286 		dvdxy = vvvv - vvvv.xxxx;
1287 		dsdxy = wwww - wwww.xxxx;
1288 	}
1289 	else
1290 	{
1291 		dudxy = Float4(dsx.xx, dsy.xx);
1292 		dvdxy = Float4(dsx.yy, dsy.yy);
1293 		dsdxy = Float4(dsx.zz, dsy.zz);
1294 	}
1295 
1296 	// Scale by texture dimensions.
1297 	dudxy *= *Pointer<Float4>(texture + OFFSET(Texture, width));
1298 	dvdxy *= *Pointer<Float4>(texture + OFFSET(Texture, height));
1299 	dsdxy *= *Pointer<Float4>(texture + OFFSET(Texture, depth));
1300 
1301 	dudxy *= dudxy;
1302 	dvdxy *= dvdxy;
1303 	dsdxy *= dsdxy;
1304 
1305 	dudxy += dvdxy;
1306 	dudxy += dsdxy;
1307 
1308 	lod = Max(Float(dudxy.y), Float(dudxy.z));  // FIXME: Max(dudxy.y, dudxy.z);
1309 
1310 	lod = log2sqrt(lod);  // log2(sqrt(lod))
1311 }
1312 
// Selects the cube face for each lane of the (x, y, z) direction vector and
// derives the in-face coordinates U, V plus the major-axis scale M.
Int4 SamplerCore::cubeFace(Float4 &U, Float4 &V, Float4 &x, Float4 &y, Float4 &z, Float4 &M)
{
	// TODO: Comply with Vulkan recommendation:
	// Vulkan 1.1: "The rules should have as the first rule that rz wins over ry and rx, and the second rule that ry wins over rx."

	Int4 xn = CmpLT(x, Float4(0.0f));  // x < 0
	Int4 yn = CmpLT(y, Float4(0.0f));  // y < 0
	Int4 zn = CmpLT(z, Float4(0.0f));  // z < 0

	Float4 absX = Abs(x);
	Float4 absY = Abs(y);
	Float4 absZ = Abs(z);

	Int4 xy = CmpNLE(absX, absY);  // abs(x) > abs(y)
	Int4 yz = CmpNLE(absY, absZ);  // abs(y) > abs(z)
	Int4 zx = CmpNLE(absZ, absX);  // abs(z) > abs(x)
	Int4 xMajor = xy & ~zx;        // abs(x) > abs(y) && abs(x) > abs(z)
	Int4 yMajor = yz & ~xy;        // abs(y) > abs(z) && abs(y) > abs(x)
	Int4 zMajor = zx & ~yz;        // abs(z) > abs(x) && abs(z) > abs(y)

	// Face index encoding (bit 0 = sign, bits 1-2 = axis):
	// FACE_POSITIVE_X = 000b
	// FACE_NEGATIVE_X = 001b
	// FACE_POSITIVE_Y = 010b
	// FACE_NEGATIVE_Y = 011b
	// FACE_POSITIVE_Z = 100b
	// FACE_NEGATIVE_Z = 101b

	// Collapse each per-lane mask to a 4-bit scalar sign mask.
	Int yAxis = SignMask(yMajor);
	Int zAxis = SignMask(zMajor);

	// Sign bit of the major axis, per lane.
	Int4 n = ((xn & xMajor) | (yn & yMajor) | (zn & zMajor)) & Int4(0x80000000);
	Int negative = SignMask(n);

	// The transposeBit* constant tables turn the three 4-bit masks into four
	// packed 3-bit face indices (one nibble per lane), one bit at a time.
	Int faces = *Pointer<Int>(constants + OFFSET(Constants, transposeBit0) + negative * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit1) + yAxis * 4);
	faces |= *Pointer<Int>(constants + OFFSET(Constants, transposeBit2) + zAxis * 4);

	// Unpack one face index per lane from the nibbles of 'faces'.
	Int4 face;
	face.x = faces & 0x7;
	face.y = (faces >> 4) & 0x7;
	face.z = (faces >> 8) & 0x7;
	face.w = (faces >> 12) & 0x7;

	M = Max(Max(absX, absY), absZ);  // Magnitude of the major axis.

	// U = xMajor ? (neg ^ -z) : ((zMajor & neg) ^ x)
	U = As<Float4>((xMajor & (n ^ As<Int4>(-z))) | (~xMajor & ((zMajor & n) ^ As<Int4>(x))));

	// V = !yMajor ? -y : (n ^ z)
	V = As<Float4>((~yMajor & As<Int4>(-y)) | (yMajor & (n ^ As<Int4>(z))));

	// Map U, V from [-M, M] onto [0, 1] face coordinates.
	M = reciprocal(M) * Float4(0.5f);
	U = U * M + Float4(0.5f);
	V = V * M + Float4(0.5f);

	return face;
}
1370 
// Applies an integer texel offset to unsigned texel coordinates, then
// re-applies the addressing mode to keep the result within [0, whd).
Short4 SamplerCore::applyOffset(Short4 &uvw, Int4 &offset, const Int4 &whd, AddressingMode mode)
{
	// Widen to 32-bit so the addition and modulo cannot overflow 16 bits.
	Int4 tmp = Int4(As<UShort4>(uvw));
	tmp = tmp + offset;

	switch(mode)
	{
		case AddressingMode::ADDRESSING_WRAP:
			// Bias by |MIN_TEXEL_OFFSET| whole periods first so the left
			// operand of % is non-negative (C++ % truncates toward zero).
			tmp = (tmp + whd * Int4(-MIN_TEXEL_OFFSET)) % whd;
			break;
		case AddressingMode::ADDRESSING_CLAMP:
		case AddressingMode::ADDRESSING_MIRROR:
		case AddressingMode::ADDRESSING_MIRRORONCE:
		case AddressingMode::ADDRESSING_BORDER:  // FIXME: Implement and test ADDRESSING_MIRROR, ADDRESSING_MIRRORONCE, ADDRESSING_BORDER
			tmp = Min(Max(tmp, Int4(0)), whd - Int4(1));
			break;
		case AddressingMode::ADDRESSING_SEAMLESS:
			ASSERT(false);  // Cube sampling doesn't support offset.
			// Deliberate fall-through to default; note that in release builds
			// ASSERT is a no-op and tmp is returned unclamped here.
		default:
			ASSERT(false);
	}

	// Narrow back to unsigned 16-bit texel coordinates.
	return As<Short4>(UShort4(tmp));
}
1395 
// Converts fixed-point texel coordinates into four linear texel indices,
// applying texel offsets, 3D slice / array layer addressing, and the
// multisample plane offset.
void SamplerCore::computeIndices(UInt index[4], Short4 uuuu, Short4 vvvv, Short4 wwww, const Short4 &layerIndex, Vector4i &offset, const Int4 &sample, const Pointer<Byte> &mipmap, SamplerFunction function)
{
	// Coordinates are unsigned 0.16 fixed-point in [0, 1); taking the high
	// half of the product with the dimension yields floor(u * width).
	uuuu = MulHigh(As<UShort4>(uuuu), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, width))));

	if(function.offset)
	{
		uuuu = applyOffset(uuuu, offset.x, *Pointer<Int4>(mipmap + OFFSET(Mipmap, width)), state.addressingModeU);
	}

	UInt4 indices = Int4(uuuu);

	if(state.is2D() || state.is3D() || state.isCube())
	{
		vvvv = MulHigh(As<UShort4>(vvvv), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, height))));

		if(function.offset)
		{
			vvvv = applyOffset(vvvv, offset.y, *Pointer<Int4>(mipmap + OFFSET(Mipmap, height)), state.addressingModeV);
		}

		// Interleave (u, v) pairs and compute u + v * pitch with one
		// multiply-accumulate per texel (onePitchP appears to pack {1, pitchP}
		// pairs — see the Mipmap struct).
		Short4 uv0uv1 = As<Short4>(UnpackLow(uuuu, vvvv));
		Short4 uv2uv3 = As<Short4>(UnpackHigh(uuuu, vvvv));
		Int2 i01 = MulAdd(uv0uv1, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));
		Int2 i23 = MulAdd(uv2uv3, *Pointer<Short4>(mipmap + OFFSET(Mipmap, onePitchP)));

		indices = UInt4(As<UInt2>(i01), As<UInt2>(i23));
	}

	if(state.is3D())
	{
		wwww = MulHigh(As<UShort4>(wwww), UShort4(*Pointer<Int4>(mipmap + OFFSET(Mipmap, depth))));

		if(function.offset)
		{
			wwww = applyOffset(wwww, offset.z, *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth)), state.addressingModeW);
		}

		// Add the slice component: w * sliceP.
		indices += As<UInt4>(Int4(As<UShort4>(wwww))) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));
	}

	if(state.isArrayed())
	{
		Int4 layer = Int4(As<UShort4>(layerIndex));

		// For cube arrays the layer argument selects a whole cube; each cube
		// occupies 6 consecutive layers.
		if(state.textureType == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY)
		{
			layer *= Int4(6);
		}

		UInt4 layerOffset = As<UInt4>(layer) * *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sliceP));

		indices += layerOffset;
	}

	if(function.sample)
	{
		// Clamp the sample index to the resource's sample count, then step to
		// the corresponding per-sample plane.
		UInt4 sampleOffset = Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
		                     *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
		indices += sampleOffset;
	}

	index[0] = Extract(indices, 0);
	index[1] = Extract(indices, 1);
	index[2] = Extract(indices, 2);
	index[3] = Extract(indices, 3);
}
1462 
computeIndices(UInt index[4],Int4 uuuu,Int4 vvvv,Int4 wwww,const Int4 & sample,Int4 valid,const Pointer<Byte> & mipmap,SamplerFunction function)1463 void SamplerCore::computeIndices(UInt index[4], Int4 uuuu, Int4 vvvv, Int4 wwww, const Int4 &sample, Int4 valid, const Pointer<Byte> &mipmap, SamplerFunction function)
1464 {
1465 	UInt4 indices = uuuu;
1466 
1467 	if(state.is2D() || state.is3D() || state.isCube())
1468 	{
1469 		indices += As<UInt4>(vvvv);
1470 	}
1471 
1472 	if(state.is3D() || state.isCube() || state.isArrayed())
1473 	{
1474 		indices += As<UInt4>(wwww);
1475 	}
1476 
1477 	if(function.sample)
1478 	{
1479 		indices += Min(As<UInt4>(sample), *Pointer<UInt4>(mipmap + OFFSET(Mipmap, sampleMax), 16)) *
1480 		           *Pointer<UInt4>(mipmap + OFFSET(Mipmap, samplePitchP), 16);
1481 	}
1482 
1483 	if(borderModeActive())
1484 	{
1485 		// Texels out of range are still sampled before being replaced
1486 		// with the border color, so sample them at linear index 0.
1487 		indices &= As<UInt4>(valid);
1488 	}
1489 
1490 	for(int i = 0; i < 4; i++)
1491 	{
1492 		index[i] = Extract(As<Int4>(indices), i);
1493 	}
1494 }
1495 
sampleTexel(UInt index[4],Pointer<Byte> buffer)1496 Vector4s SamplerCore::sampleTexel(UInt index[4], Pointer<Byte> buffer)
1497 {
1498 	Vector4s c;
1499 
1500 	if(has16bitPackedTextureFormat())
1501 	{
1502 		c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1503 		c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1504 		c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1505 		c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1506 
1507 		switch(state.textureFormat)
1508 		{
1509 			case VK_FORMAT_R5G6B5_UNORM_PACK16:
1510 				c.z = (c.x & Short4(0x001Fu)) << 11;
1511 				c.y = (c.x & Short4(0x07E0u)) << 5;
1512 				c.x = (c.x & Short4(0xF800u));
1513 				break;
1514 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1515 				c.w = (c.x << 12) & Short4(0xF000u);
1516 				c.z = (c.x) & Short4(0xF000u);
1517 				c.y = (c.x << 4) & Short4(0xF000u);
1518 				c.x = (c.x << 8) & Short4(0xF000u);
1519 				break;
1520 			case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1521 				c.w = (c.x) & Short4(0x8000u);
1522 				c.z = (c.x << 11) & Short4(0xF800u);
1523 				c.y = (c.x << 6) & Short4(0xF800u);
1524 				c.x = (c.x << 1) & Short4(0xF800u);
1525 				break;
1526 			default:
1527 				ASSERT(false);
1528 		}
1529 	}
1530 	else if(has8bitTextureComponents())
1531 	{
1532 		switch(textureComponentCount())
1533 		{
1534 			case 4:
1535 			{
1536 				Byte4 c0 = Pointer<Byte4>(buffer)[index[0]];
1537 				Byte4 c1 = Pointer<Byte4>(buffer)[index[1]];
1538 				Byte4 c2 = Pointer<Byte4>(buffer)[index[2]];
1539 				Byte4 c3 = Pointer<Byte4>(buffer)[index[3]];
1540 				c.x = Unpack(c0, c1);
1541 				c.y = Unpack(c2, c3);
1542 
1543 				switch(state.textureFormat)
1544 				{
1545 					case VK_FORMAT_B8G8R8A8_UNORM:
1546 					case VK_FORMAT_B8G8R8A8_SRGB:
1547 						c.z = As<Short4>(UnpackLow(c.x, c.y));
1548 						c.x = As<Short4>(UnpackHigh(c.x, c.y));
1549 						c.y = c.z;
1550 						c.w = c.x;
1551 						c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
1552 						c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
1553 						c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
1554 						c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
1555 						break;
1556 					case VK_FORMAT_R8G8B8A8_UNORM:
1557 					case VK_FORMAT_R8G8B8A8_SNORM:
1558 					case VK_FORMAT_R8G8B8A8_SINT:
1559 					case VK_FORMAT_R8G8B8A8_SRGB:
1560 					case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1561 					case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
1562 					case VK_FORMAT_A8B8G8R8_SINT_PACK32:
1563 					case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1564 						c.z = As<Short4>(UnpackHigh(c.x, c.y));
1565 						c.x = As<Short4>(UnpackLow(c.x, c.y));
1566 						c.y = c.x;
1567 						c.w = c.z;
1568 						c.x = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.x));
1569 						c.y = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.y));
1570 						c.z = UnpackLow(As<Byte8>(Short4(0)), As<Byte8>(c.z));
1571 						c.w = UnpackHigh(As<Byte8>(Short4(0)), As<Byte8>(c.w));
1572 						// Propagate sign bit
1573 						if(state.textureFormat == VK_FORMAT_R8G8B8A8_SINT ||
1574 						   state.textureFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32)
1575 						{
1576 							c.x >>= 8;
1577 							c.y >>= 8;
1578 							c.z >>= 8;
1579 							c.w >>= 8;
1580 						}
1581 						break;
1582 					case VK_FORMAT_R8G8B8A8_UINT:
1583 					case VK_FORMAT_A8B8G8R8_UINT_PACK32:
1584 						c.z = As<Short4>(UnpackHigh(c.x, c.y));
1585 						c.x = As<Short4>(UnpackLow(c.x, c.y));
1586 						c.y = c.x;
1587 						c.w = c.z;
1588 						c.x = UnpackLow(As<Byte8>(c.x), As<Byte8>(Short4(0)));
1589 						c.y = UnpackHigh(As<Byte8>(c.y), As<Byte8>(Short4(0)));
1590 						c.z = UnpackLow(As<Byte8>(c.z), As<Byte8>(Short4(0)));
1591 						c.w = UnpackHigh(As<Byte8>(c.w), As<Byte8>(Short4(0)));
1592 						break;
1593 					default:
1594 						ASSERT(false);
1595 				}
1596 			}
1597 			break;
1598 			case 2:
1599 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1600 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1601 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1602 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1603 
1604 				switch(state.textureFormat)
1605 				{
1606 					case VK_FORMAT_R8G8_UNORM:
1607 					case VK_FORMAT_R8G8_SNORM:
1608 					case VK_FORMAT_R8G8_SRGB:
1609 						c.y = (c.x & Short4(0xFF00u));
1610 						c.x = (c.x << 8);
1611 						break;
1612 					case VK_FORMAT_R8G8_SINT:
1613 						c.y = c.x >> 8;
1614 						c.x = (c.x << 8) >> 8;  // Propagate sign bit
1615 						break;
1616 					case VK_FORMAT_R8G8_UINT:
1617 						c.y = As<Short4>(As<UShort4>(c.x) >> 8);
1618 						c.x &= Short4(0x00FFu);
1619 						break;
1620 					default:
1621 						ASSERT(false);
1622 				}
1623 				break;
1624 			case 1:
1625 			{
1626 				Int c0 = Int(*Pointer<Byte>(buffer + index[0]));
1627 				Int c1 = Int(*Pointer<Byte>(buffer + index[1]));
1628 				Int c2 = Int(*Pointer<Byte>(buffer + index[2]));
1629 				Int c3 = Int(*Pointer<Byte>(buffer + index[3]));
1630 				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
1631 
1632 				switch(state.textureFormat)
1633 				{
1634 					case VK_FORMAT_R8_SINT:
1635 					case VK_FORMAT_R8_UINT:
1636 					case VK_FORMAT_S8_UINT:
1637 					{
1638 						Int zero(0);
1639 						c.x = Unpack(As<Byte4>(c0), As<Byte4>(zero));
1640 						// Propagate sign bit
1641 						if(state.textureFormat == VK_FORMAT_R8_SINT)
1642 						{
1643 							c.x = (c.x << 8) >> 8;
1644 						}
1645 					}
1646 					break;
1647 					case VK_FORMAT_R8_SNORM:
1648 					case VK_FORMAT_R8_UNORM:
1649 					case VK_FORMAT_R8_SRGB:
1650 						// TODO: avoid populating the low bits at all.
1651 						c.x = Unpack(As<Byte4>(c0));
1652 						c.x &= Short4(0xFF00u);
1653 						break;
1654 					default:
1655 						c.x = Unpack(As<Byte4>(c0));
1656 						break;
1657 				}
1658 			}
1659 			break;
1660 			default:
1661 				ASSERT(false);
1662 		}
1663 	}
1664 	else if(has16bitTextureComponents())
1665 	{
1666 		switch(textureComponentCount())
1667 		{
1668 			case 4:
1669 				c.x = Pointer<Short4>(buffer)[index[0]];
1670 				c.y = Pointer<Short4>(buffer)[index[1]];
1671 				c.z = Pointer<Short4>(buffer)[index[2]];
1672 				c.w = Pointer<Short4>(buffer)[index[3]];
1673 				transpose4x4(c.x, c.y, c.z, c.w);
1674 				break;
1675 			case 2:
1676 				c.x = *Pointer<Short4>(buffer + 4 * index[0]);
1677 				c.x = As<Short4>(UnpackLow(c.x, *Pointer<Short4>(buffer + 4 * index[1])));
1678 				c.z = *Pointer<Short4>(buffer + 4 * index[2]);
1679 				c.z = As<Short4>(UnpackLow(c.z, *Pointer<Short4>(buffer + 4 * index[3])));
1680 				c.y = c.x;
1681 				c.x = UnpackLow(As<Int2>(c.x), As<Int2>(c.z));
1682 				c.y = UnpackHigh(As<Int2>(c.y), As<Int2>(c.z));
1683 				break;
1684 			case 1:
1685 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[0]], 0);
1686 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[1]], 1);
1687 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[2]], 2);
1688 				c.x = Insert(c.x, Pointer<Short>(buffer)[index[3]], 3);
1689 				break;
1690 			default:
1691 				ASSERT(false);
1692 		}
1693 	}
1694 	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UNORM_PACK32)
1695 	{
1696 		Int4 cc;
1697 		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1698 		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1699 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1700 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1701 
1702 		c = a2b10g10r10Unpack(cc);
1703 	}
1704 	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UNORM_PACK32)
1705 	{
1706 		Int4 cc;
1707 		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1708 		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1709 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1710 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1711 
1712 		c = a2r10g10b10Unpack(cc);
1713 	}
1714 	else if(state.textureFormat == VK_FORMAT_A2B10G10R10_UINT_PACK32)
1715 	{
1716 		Int4 cc;
1717 		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1718 		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1719 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1720 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1721 
1722 		c.x = Short4((cc & Int4(0x3FF)));
1723 		c.y = Short4(((cc >> 10) & Int4(0x3FF)));
1724 		c.z = Short4(((cc >> 20) & Int4(0x3FF)));
1725 		c.w = Short4(((cc >> 30) & Int4(0x3)));
1726 	}
1727 	else if(state.textureFormat == VK_FORMAT_A2R10G10B10_UINT_PACK32)
1728 	{
1729 		Int4 cc;
1730 		cc = Insert(cc, Pointer<Int>(buffer)[index[0]], 0);
1731 		cc = Insert(cc, Pointer<Int>(buffer)[index[1]], 1);
1732 		cc = Insert(cc, Pointer<Int>(buffer)[index[2]], 2);
1733 		cc = Insert(cc, Pointer<Int>(buffer)[index[3]], 3);
1734 
1735 		c.z = Short4((cc & Int4(0x3FF)));
1736 		c.y = Short4(((cc >> 10) & Int4(0x3FF)));
1737 		c.x = Short4(((cc >> 20) & Int4(0x3FF)));
1738 		c.w = Short4(((cc >> 30) & Int4(0x3)));
1739 	}
1740 	else
1741 		ASSERT(false);
1742 
1743 	if(state.textureFormat.isSRGBformat())
1744 	{
1745 		for(int i = 0; i < textureComponentCount(); i++)
1746 		{
1747 			if(isRGBComponent(i))
1748 			{
1749 				// The current table-based sRGB conversion requires 0xFF00 to represent 1.0.
1750 				ASSERT(state.textureFormat.has8bitTextureComponents());
1751 
1752 				sRGBtoLinearFF00(c[i]);
1753 			}
1754 		}
1755 	}
1756 
1757 	return c;
1758 }
1759 
// Fetches four neighboring texels and converts them to 16-bit fixed-point
// components for filtering. YCbCr formats are handled here: the luma and
// chroma planes are read separately (chroma indices are recomputed against
// the next Mipmap entry, which holds the chroma plane's dimensions) and then
// converted to RGB according to the sampler's Y'CbCr model. All other formats
// defer to the index-based sampleTexel() overload.
Vector4s SamplerCore::sampleTexel(Short4 &uuuu, Short4 &vvvv, Short4 &wwww, const Short4 &layerIndex, Vector4i &offset, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer, SamplerFunction function)
{
	Vector4s c;

	UInt index[4];
	computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap, function);

	if(isYcbcrFormat())
	{
		// Pointers to the planes of YCbCr images are stored in consecutive mipmap levels.
		Pointer<Byte> bufferY = buffer;                                                                         // *Pointer<Pointer<Byte>>(mipmap + 0 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));
		Pointer<Byte> bufferU = *Pointer<Pointer<Byte>>(mipmap + 1 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));  // U/V for 2-plane interleaved formats.
		Pointer<Byte> bufferV = *Pointer<Pointer<Byte>>(mipmap + 2 * sizeof(Mipmap) + OFFSET(Mipmap, buffer));

		// Luminance
		// Gather the four Y bytes into one 32-bit word, then unpack each byte
		// into both halves of a 16-bit lane (8.8 fixed-point).
		Int c0 = Int(bufferY[index[0]]);
		Int c1 = Int(bufferY[index[1]]);
		Int c2 = Int(bufferY[index[2]]);
		Int c3 = Int(bufferY[index[3]]);
		c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
		UShort4 Y = As<UShort4>(Unpack(As<Byte4>(c0)));

		UShort4 Cb, Cr;

		// Chroma
		{
			// Chroma planes can be subsampled; recompute texel indices using the
			// dimensions stored in the next Mipmap entry.
			computeIndices(index, uuuu, vvvv, wwww, layerIndex, offset, sample, mipmap + sizeof(Mipmap), function);
			UShort4 U, V;

			if(state.textureFormat == VK_FORMAT_G8_B8_R8_3PLANE_420_UNORM)
			{
				c0 = Int(bufferU[index[0]]);
				c1 = Int(bufferU[index[1]]);
				c2 = Int(bufferU[index[2]]);
				c3 = Int(bufferU[index[3]]);
				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
				U = As<UShort4>(Unpack(As<Byte4>(c0)));

				c0 = Int(bufferV[index[0]]);
				c1 = Int(bufferV[index[1]]);
				c2 = Int(bufferV[index[2]]);
				c3 = Int(bufferV[index[3]]);
				c0 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
				V = As<UShort4>(Unpack(As<Byte4>(c0)));
			}
			else if(state.textureFormat == VK_FORMAT_G8_B8R8_2PLANE_420_UNORM)
			{
				// Each 16-bit element read from the interleaved plane holds a U byte
				// (low) and a V byte (high). Splat each byte into both halves of its
				// lane to produce 8.8 fixed-point values.
				Short4 UV;
				UV = Insert(UV, Pointer<Short>(bufferU)[index[0]], 0);  // TODO: Insert(UShort4, UShort)
				UV = Insert(UV, Pointer<Short>(bufferU)[index[1]], 1);
				UV = Insert(UV, Pointer<Short>(bufferU)[index[2]], 2);
				UV = Insert(UV, Pointer<Short>(bufferU)[index[3]], 3);
				U = (UV & Short4(0x00FFu)) | (UV << 8);
				V = (UV & Short4(0xFF00u)) | As<Short4>(As<UShort4>(UV) >> 8);
			}
			else
				UNSUPPORTED("state.textureFormat %d", (int)state.textureFormat);

			if(!state.swappedChroma)
			{
				Cb = U;
				Cr = V;
			}
			else
			{
				Cb = V;
				Cr = U;
			}
		}

		if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY)
		{
			// YCbCr formats are treated as signed 15-bit.
			c.x = Cr >> 1;
			c.y = Y >> 1;
			c.z = Cb >> 1;
		}
		else
		{
			// Scaling and bias for studio-swing range: Y = [16 .. 235], U/V = [16 .. 240]
			// Scale down by 0x0101 to normalize the 8.8 samples, and up by 0x7FFF for signed 15-bit output.
			float yOffset = static_cast<float>(state.studioSwing ? 16 * 0x0101 : 0);
			float uvOffset = static_cast<float>(128 * 0x0101);
			float yFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 219 * 0x0101 : 255 * 0x0101);
			float uvFactor = static_cast<float>(0x7FFF) / static_cast<float>(state.studioSwing ? 224 * 0x0101 : 255 * 0x0101);

			Float4 y = (Float4(Y) - Float4(yOffset)) * Float4(yFactor);
			Float4 u = (Float4(Cb) - Float4(uvOffset)) * Float4(uvFactor);
			Float4 v = (Float4(Cr) - Float4(uvOffset)) * Float4(uvFactor);

			if(state.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_IDENTITY)
			{
				// Range-adjusted but not color-converted; components stay in YCbCr order.
				c.x = Short4(v);
				c.y = Short4(y);
				c.z = Short4(u);
			}
			else
			{
				// Generic YCbCr to RGB transformation:
				// R = Y                               +           2 * (1 - Kr) * Cr
				// G = Y - 2 * Kb * (1 - Kb) / Kg * Cb - 2 * Kr * (1 - Kr) / Kg * Cr
				// B = Y +           2 * (1 - Kb) * Cb

				float Kb = 0.114f;
				float Kr = 0.299f;

				switch(state.ycbcrModel)
				{
					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_709:
						Kb = 0.0722f;
						Kr = 0.2126f;
						break;
					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_601:
						Kb = 0.114f;
						Kr = 0.299f;
						break;
					case VK_SAMPLER_YCBCR_MODEL_CONVERSION_YCBCR_2020:
						Kb = 0.0593f;
						Kr = 0.2627f;
						break;
					default:
						UNSUPPORTED("ycbcrModel %d", int(state.ycbcrModel));
				}

				const float Kg = 1.0f - Kr - Kb;

				const float Rr = 2 * (1 - Kr);
				const float Gb = -2 * Kb * (1 - Kb) / Kg;
				const float Gr = -2 * Kr * (1 - Kr) / Kg;
				const float Bb = 2 * (1 - Kb);

				Float4 r = y + Float4(Rr) * v;
				Float4 g = y + Float4(Gb) * u + Float4(Gr) * v;
				Float4 b = y + Float4(Bb) * u;

				c.x = Short4(r);
				c.y = Short4(g);
				c.z = Short4(b);
			}
		}
	}
	else
	{
		return sampleTexel(index, buffer);
	}

	return c;
}
1908 
// Fetches four texels and converts them to 32-bit float components. Handles
// float and 32-bit-integer formats directly; other formats go through the
// fixed-point sampleTexel() and are converted. Also applies depth-compare
// (shadow) sampling and border texel replacement.
Vector4f SamplerCore::sampleTexel(Int4 &uuuu, Int4 &vvvv, Int4 &wwww, Float4 &dRef, const Int4 &sample, Pointer<Byte> &mipmap, Pointer<Byte> buffer, SamplerFunction function)
{
	Int4 valid;

	if(borderModeActive())
	{
		// Valid texels have positive coordinates.
		Int4 negative = uuuu;
		if(state.is2D() || state.is3D() || state.isCube()) negative |= vvvv;
		if(state.is3D() || state.isCube() || state.isArrayed()) negative |= wwww;
		valid = CmpNLT(negative, Int4(0));
	}

	UInt index[4];
	computeIndices(index, uuuu, vvvv, wwww, sample, valid, mipmap, function);

	Vector4f c;

	if(hasFloatTexture() || has32bitIntegerTextureComponents())
	{
		UInt4 t0, t1, t2, t3;

		switch(state.textureFormat)
		{
			case VK_FORMAT_R16_SFLOAT:
				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 2));
				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 2));
				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 2));
				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 2));

				c.x.x = Extract(As<Float4>(halfToFloatBits(t0)), 0);
				c.x.y = Extract(As<Float4>(halfToFloatBits(t1)), 0);
				c.x.z = Extract(As<Float4>(halfToFloatBits(t2)), 0);
				c.x.w = Extract(As<Float4>(halfToFloatBits(t3)), 0);
				break;
			case VK_FORMAT_R16G16_SFLOAT:
				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 4));
				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 4));
				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 4));
				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 4));

				// FIXME: shuffles
				c.x = As<Float4>(halfToFloatBits(t0));
				c.y = As<Float4>(halfToFloatBits(t1));
				c.z = As<Float4>(halfToFloatBits(t2));
				c.w = As<Float4>(halfToFloatBits(t3));
				transpose4x4(c.x, c.y, c.z, c.w);
				break;
			case VK_FORMAT_R16G16B16A16_SFLOAT:
				t0 = Int4(*Pointer<UShort4>(buffer + index[0] * 8));
				t1 = Int4(*Pointer<UShort4>(buffer + index[1] * 8));
				t2 = Int4(*Pointer<UShort4>(buffer + index[2] * 8));
				t3 = Int4(*Pointer<UShort4>(buffer + index[3] * 8));

				c.x = As<Float4>(halfToFloatBits(t0));
				c.y = As<Float4>(halfToFloatBits(t1));
				c.z = As<Float4>(halfToFloatBits(t2));
				c.w = As<Float4>(halfToFloatBits(t3));
				transpose4x4(c.x, c.y, c.z, c.w);
				break;
			case VK_FORMAT_R32_SFLOAT:
			case VK_FORMAT_R32_SINT:
			case VK_FORMAT_R32_UINT:
			case VK_FORMAT_D32_SFLOAT:
				// FIXME: Optimal shuffling?
				c.x.x = *Pointer<Float>(buffer + index[0] * 4);
				c.x.y = *Pointer<Float>(buffer + index[1] * 4);
				c.x.z = *Pointer<Float>(buffer + index[2] * 4);
				c.x.w = *Pointer<Float>(buffer + index[3] * 4);
				break;
			case VK_FORMAT_R32G32_SFLOAT:
			case VK_FORMAT_R32G32_SINT:
			case VK_FORMAT_R32G32_UINT:
				// FIXME: Optimal shuffling?
				// Reads texel pairs into c.x/c.z, then interleaves to get all R
				// components in c.x and all G components in c.y. The '- 8' keeps
				// the second 16-byte read inside the buffer for odd indices.
				c.x.xy = *Pointer<Float4>(buffer + index[0] * 8);
				c.x.zw = *Pointer<Float4>(buffer + index[1] * 8 - 8);
				c.z.xy = *Pointer<Float4>(buffer + index[2] * 8);
				c.z.zw = *Pointer<Float4>(buffer + index[3] * 8 - 8);
				c.y = c.x;
				c.x = Float4(c.x.xz, c.z.xz);
				c.y = Float4(c.y.yw, c.z.yw);
				break;
			case VK_FORMAT_R32G32B32A32_SFLOAT:
			case VK_FORMAT_R32G32B32A32_SINT:
			case VK_FORMAT_R32G32B32A32_UINT:
				c.x = *Pointer<Float4>(buffer + index[0] * 16, 16);
				c.y = *Pointer<Float4>(buffer + index[1] * 16, 16);
				c.z = *Pointer<Float4>(buffer + index[2] * 16, 16);
				c.w = *Pointer<Float4>(buffer + index[3] * 16, 16);
				transpose4x4(c.x, c.y, c.z, c.w);
				break;
			case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
			{
				// Shared-exponent format: c.w temporarily holds the per-texel
				// scale 2^(exponent - 24), applied to each 9-bit mantissa.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.w = Float4(UInt4(1) << ((t0 >> 27) & UInt4(0x1F))) * Float4(1.0f / (1 << 24));
				c.x = Float4(t0 & UInt4(0x1FF)) * c.w;
				c.y = Float4((t0 >> 9) & UInt4(0x1FF)) * c.w;
				c.z = Float4((t0 >> 18) & UInt4(0x1FF)) * c.w;
				break;
			}
			case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
			{
				// Packed unsigned floats: shift each field into the half-float
				// bit layout, then expand to full float.
				Float4 t;  // TODO: add Insert(UInt4, RValue<UInt>)
				t.x = *Pointer<Float>(buffer + index[0] * 4);
				t.y = *Pointer<Float>(buffer + index[1] * 4);
				t.z = *Pointer<Float>(buffer + index[2] * 4);
				t.w = *Pointer<Float>(buffer + index[3] * 4);
				t0 = As<UInt4>(t);
				c.x = As<Float4>(halfToFloatBits((t0 << 4) & UInt4(0x7FF0)));
				c.y = As<Float4>(halfToFloatBits((t0 >> 7) & UInt4(0x7FF0)));
				c.z = As<Float4>(halfToFloatBits((t0 >> 17) & UInt4(0x7FE0)));
				break;
			}
			default:
				UNSUPPORTED("Format %d", VkFormat(state.textureFormat));
		}
	}
	else
	{
		ASSERT(!isYcbcrFormat());

		Vector4s cs = sampleTexel(index, buffer);

		// Unsigned components zero-extend, signed components sign-extend.
		// Unnormalized-integer formats convert the bits to integer values
		// (bit-cast to float lanes); normalized formats convert to float.
		bool isInteger = state.textureFormat.isUnnormalizedInteger();
		int componentCount = textureComponentCount();
		for(int n = 0; n < componentCount; n++)
		{
			if(hasUnsignedTextureComponent(n))
			{
				if(isInteger)
				{
					c[n] = As<Float4>(Int4(As<UShort4>(cs[n])));
				}
				else
				{
					c[n] = Float4(As<UShort4>(cs[n]));
				}
			}
			else
			{
				if(isInteger)
				{
					c[n] = As<Float4>(Int4(cs[n]));
				}
				else
				{
					c[n] = Float4(cs[n]);
				}
			}
		}
	}

	if(state.compareEnable)
	{
		Float4 ref = dRef;

		if(!hasFloatTexture())
		{
			// D16_UNORM: clamp reference, normalize texel value
			ref = Min(Max(ref, Float4(0.0f)), Float4(1.0f));
			c.x = c.x * Float4(1.0f / 0xFFFF);
		}

		Int4 boolean;

		switch(state.compareOp)
		{
			case VK_COMPARE_OP_LESS_OR_EQUAL: boolean = CmpLE(ref, c.x); break;
			case VK_COMPARE_OP_GREATER_OR_EQUAL: boolean = CmpNLT(ref, c.x); break;
			case VK_COMPARE_OP_LESS: boolean = CmpLT(ref, c.x); break;
			case VK_COMPARE_OP_GREATER: boolean = CmpNLE(ref, c.x); break;
			case VK_COMPARE_OP_EQUAL: boolean = CmpEQ(ref, c.x); break;
			case VK_COMPARE_OP_NOT_EQUAL: boolean = CmpNEQ(ref, c.x); break;
			case VK_COMPARE_OP_ALWAYS: boolean = Int4(-1); break;
			case VK_COMPARE_OP_NEVER: boolean = Int4(0); break;
			default: ASSERT(false);
		}

		// Comparison result is 1.0 (pass) or 0.0 (fail) in the red channel.
		c.x = As<Float4>(boolean & As<Int4>(Float4(1.0f)));
		c.y = Float4(0.0f);
		c.z = Float4(0.0f);
		c.w = Float4(1.0f);
	}

	if(borderModeActive())
	{
		c = replaceBorderTexel(c, valid);
	}

	return c;
}
2105 
// Substitutes the sampler's border color into lanes flagged as invalid.
// 'valid' is a per-lane mask: all-ones keeps the sampled texel, zeros take
// the border color. For scaled (normalized fixed-point) textures, a border
// value of 1.0 is encoded as the format's maximum fixed-point value.
Vector4f SamplerCore::replaceBorderTexel(const Vector4f &c, Int4 valid)
{
	bool scaled = !hasFloatTexture() && !hasUnnormalizedIntegerTexture() && !state.compareEnable;
	bool sign = !hasUnsignedTextureComponent(0);
	Int4 one = scaled ? As<Int4>(Float4(static_cast<float>(sign ? 0x7FFF : 0xFFFF))) : As<Int4>(Float4(1.0f));

	Int4 rgb;
	Int4 alpha;

	switch(state.border)
	{
		case VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK:
		case VK_BORDER_COLOR_INT_TRANSPARENT_BLACK:
			rgb = Int4(0);
			alpha = Int4(0);
			break;
		case VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK:
			rgb = Int4(0);
			alpha = one;
			break;
		case VK_BORDER_COLOR_INT_OPAQUE_BLACK:
			rgb = Int4(0);
			alpha = Int4(1);
			break;
		case VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE:
			rgb = one;
			alpha = one;
			break;
		case VK_BORDER_COLOR_INT_OPAQUE_WHITE:
			rgb = Int4(1);
			alpha = Int4(1);
			break;
		default:
			UNSUPPORTED("sint/uint/sfloat border: %u", state.border);
	}

	// Per-lane select between the sampled texel and the border value.
	auto select = [&](const Float4 &texel, const Int4 &border) {
		return As<Float4>((valid & As<Int4>(texel)) | (~valid & border));  // TODO: IfThenElse()
	};

	Vector4f out;
	out.x = select(c.x, rgb);
	out.y = select(c.y, rgb);
	out.z = select(c.z, rgb);
	out.w = select(c.w, alpha);

	return out;
}
2150 
selectMipmap(const Pointer<Byte> & texture,Pointer<Byte> & mipmap,Pointer<Byte> & buffer,const Float & lod,bool secondLOD)2151 void SamplerCore::selectMipmap(const Pointer<Byte> &texture, Pointer<Byte> &mipmap, Pointer<Byte> &buffer, const Float &lod, bool secondLOD)
2152 {
2153 	Pointer<Byte> mipmap0 = texture + OFFSET(Texture, mipmap[0]);
2154 
2155 	if(state.mipmapFilter == MIPMAP_NONE)
2156 	{
2157 		mipmap = mipmap0;
2158 	}
2159 	else
2160 	{
2161 		Int ilod;
2162 
2163 		if(state.mipmapFilter == MIPMAP_POINT)
2164 		{
2165 			// TODO: Preferred formula is ceil(lod + 0.5) - 1
2166 			ilod = RoundInt(lod);
2167 		}
2168 		else  // MIPMAP_LINEAR
2169 		{
2170 			ilod = Int(lod);
2171 		}
2172 
2173 		mipmap = mipmap0 + ilod * sizeof(Mipmap) + secondLOD * sizeof(Mipmap);
2174 	}
2175 
2176 	buffer = *Pointer<Pointer<Byte>>(mipmap + OFFSET(Mipmap, buffer));
2177 }
2178 
computeFilterOffset(Float & lod)2179 Int4 SamplerCore::computeFilterOffset(Float &lod)
2180 {
2181 	if(state.textureFilter == FILTER_POINT)
2182 	{
2183 		return Int4(0);
2184 	}
2185 	else if(state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
2186 	{
2187 		return CmpNLE(Float4(lod), Float4(0.0f));
2188 	}
2189 	else if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR)
2190 	{
2191 		return CmpLE(Float4(lod), Float4(0.0f));
2192 	}
2193 
2194 	return Int4(~0);
2195 }
2196 
address(const Float4 & uw,AddressingMode addressingMode,Pointer<Byte> & mipmap)2197 Short4 SamplerCore::address(const Float4 &uw, AddressingMode addressingMode, Pointer<Byte> &mipmap)
2198 {
2199 	if(addressingMode == ADDRESSING_UNUSED)
2200 	{
2201 		return Short4(0);  // TODO(b/134669567): Optimize for 1D filtering
2202 	}
2203 	else if(addressingMode == ADDRESSING_CLAMP || addressingMode == ADDRESSING_BORDER)
2204 	{
2205 		Float4 clamp = Min(Max(uw, Float4(0.0f)), Float4(65535.0f / 65536.0f));
2206 
2207 		return Short4(Int4(clamp * Float4(1 << 16)));
2208 	}
2209 	else if(addressingMode == ADDRESSING_MIRROR)
2210 	{
2211 		Int4 convert = Int4(uw * Float4(1 << 16));
2212 		Int4 mirror = (convert << 15) >> 31;
2213 
2214 		convert ^= mirror;
2215 
2216 		return Short4(convert);
2217 	}
2218 	else if(addressingMode == ADDRESSING_MIRRORONCE)
2219 	{
2220 		// Absolute value
2221 		Int4 convert = Int4(Abs(uw * Float4(1 << 16)));
2222 
2223 		// Clamp
2224 		convert -= Int4(0x00008000, 0x00008000, 0x00008000, 0x00008000);
2225 		convert = As<Int4>(PackSigned(convert, convert));
2226 
2227 		return As<Short4>(Int2(convert)) + Short4(0x8000u);
2228 	}
2229 	else  // Wrap
2230 	{
2231 		return Short4(Int4(uw * Float4(1 << 16)));
2232 	}
2233 }
2234 
computeLayerIndex(const Float4 & a,Pointer<Byte> & mipmap)2235 Short4 SamplerCore::computeLayerIndex(const Float4 &a, Pointer<Byte> &mipmap)
2236 {
2237 	if(!state.isArrayed())
2238 	{
2239 		return {};
2240 	}
2241 
2242 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth));
2243 
2244 	return Short4(Min(Max(RoundInt(a), Int4(0)), layers - Int4(1)));
2245 }
2246 
2247 // TODO: Eliminate when the gather + mirror addressing case is handled by mirroring the footprint.
mirror(Int4 n)2248 static Int4 mirror(Int4 n)
2249 {
2250 	auto positive = CmpNLT(n, Int4(0));
2251 	return (positive & n) | (~positive & (-(Int4(1) + n)));
2252 }
2253 
mod(Int4 n,Int4 d)2254 static Int4 mod(Int4 n, Int4 d)
2255 {
2256 	auto x = n % d;
2257 	auto positive = CmpNLT(x, Int4(0));
2258 	return (positive & x) | (~positive & (x + d));
2259 }
2260 
// Converts one texture coordinate to integer texel coordinates along a single
// axis, applying the addressing mode and filter footprint. Outputs:
//   xyz0 / xyz1 - the two filter tap coordinates along this axis
//   f           - the fractional weight between them (linear filtering)
// Under ADDRESSING_BORDER (and for robustness-checked fetches) out-of-range
// coordinates are forced to -1 so the texel can be replaced later.
void SamplerCore::address(const Float4 &uvw, Int4 &xyz0, Int4 &xyz1, Float4 &f, Pointer<Byte> &mipmap, Int4 &offset, Int4 &filter, int whd, AddressingMode addressingMode, SamplerFunction function)
{
	if(addressingMode == ADDRESSING_UNUSED)
	{
		f = Float4(0.0f);  // TODO(b/134669567): Optimize for 1D filtering
		return;
	}

	// 'whd' selects the width/height/depth field of the Mipmap struct.
	Int4 dim = *Pointer<Int4>(mipmap + whd, 16);
	Int4 maxXYZ = dim - Int4(1);

	if(function == Fetch)  // Unnormalized coordinates
	{
		Int4 xyz = function.offset ? As<Int4>(uvw) + offset : As<Int4>(uvw);
		xyz0 = Min(Max(xyz, Int4(0)), maxXYZ);

		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
		// If the above clamping altered the result, the access is out-of-bounds.
		// In that case set the coordinate to -1 to perform texel replacement later.
		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
		xyz0 |= outOfBounds;
	}
	else if(addressingMode == ADDRESSING_CUBEFACE)
	{
		// The cube face index was already computed; pass it through unchanged.
		xyz0 = As<Int4>(uvw);
	}
	else
	{
		const int halfBits = 0x3EFFFFFF;  // Value just under 0.5f
		const int oneBits = 0x3F7FFFFF;   // Value just under 1.0f
		const int twoBits = 0x3FFFFFFF;   // Value just under 2.0f

		Float4 coord = uvw;

		if(state.unnormalizedCoordinates)
		{
			switch(addressingMode)
			{
				case ADDRESSING_CLAMP:
					coord = Min(Max(coord, Float4(0.0f)), Float4(dim) * As<Float4>(Int4(oneBits)));
					break;
				case ADDRESSING_BORDER:
					// Don't map to a valid range here.
					break;
				default:
					// "If unnormalizedCoordinates is VK_TRUE, addressModeU and addressModeV must each be
					//  either VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE or VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER"
					UNREACHABLE("addressingMode %d", int(addressingMode));
					break;
			}
		}
		else if(state.textureFilter == FILTER_GATHER && addressingMode == ADDRESSING_MIRROR)
		{
			// Gather requires the 'footprint' of the texels from which a component is taken, to also mirror around.
			// Therefore we can't just compute one texel's location and find the other ones at +1 offsets from it.
			// Here we handle that case separately by doing the mirroring per texel coordinate.
			// TODO: Mirror the footprint by adjusting the sign of the 0.5f and 1 offsets.

			coord = coord * Float4(dim);
			coord -= Float4(0.5f);
			Float4 floor = Floor(coord);
			xyz0 = Int4(floor);

			if(function.offset)
			{
				xyz0 += offset;
			}

			xyz1 = xyz0 + Int4(1);

			// Wrap into a two-period range, then reflect into [0, dim - 1].
			xyz0 = (maxXYZ)-mirror(mod(xyz0, Int4(2) * dim) - dim);
			xyz1 = (maxXYZ)-mirror(mod(xyz1, Int4(2) * dim) - dim);

			return;
		}
		else
		{
			if(!function.offset)
			{
				switch(addressingMode)
				{
					case ADDRESSING_CLAMP:
					case ADDRESSING_SEAMLESS:
						// While cube face coordinates are nominally already in the [0.0, 1.0] range
						// due to the projection, and numerical imprecision is tolerated due to the
						// border of pixels for seamless filtering, the projection doesn't cause
						// range normalization for Inf and NaN values. So we always clamp.
						{
							Float4 one = As<Float4>(Int4(oneBits));
							coord = Min(Max(coord, Float4(0.0f)), one);
						}
						break;
					case ADDRESSING_MIRROR:
					{
						// Triangle wave: reflects the coordinate every integer period.
						Float4 half = As<Float4>(Int4(halfBits));
						Float4 one = As<Float4>(Int4(oneBits));
						Float4 two = As<Float4>(Int4(twoBits));
						coord = one - Abs(two * Frac(coord * half) - one);
					}
					break;
					case ADDRESSING_MIRRORONCE:
					{
						// Same reflection, but the input is first clamped to a single
						// mirrored period around zero.
						Float4 half = As<Float4>(Int4(halfBits));
						Float4 one = As<Float4>(Int4(oneBits));
						Float4 two = As<Float4>(Int4(twoBits));
						coord = one - Abs(two * Frac(Min(Max(coord, -one), two) * half) - one);
					}
					break;
					case ADDRESSING_BORDER:
						// Don't map to a valid range here.
						break;
					default:  // Wrap
						coord = Frac(coord);
						break;
				}
			}

			coord = coord * Float4(dim);
		}

		if(state.textureFilter == FILTER_POINT)
		{
			if(addressingMode == ADDRESSING_BORDER || function.offset)
			{
				xyz0 = Int4(Floor(coord));
			}
			else  // Can't have negative coordinates, so floor() is redundant when casting to int.
			{
				xyz0 = Int4(coord);
			}
		}
		else
		{
			// Linear filtering: center the footprint on the coordinate. 'filter'
			// is a per-lane mask selecting whether the 0.5 texel-center shift
			// applies (mixed min/mag filters enable it conditionally).
			if(state.textureFilter == FILTER_MIN_POINT_MAG_LINEAR ||
			   state.textureFilter == FILTER_MIN_LINEAR_MAG_POINT)
			{
				coord -= As<Float4>(As<Int4>(Float4(0.5f)) & filter);
			}
			else
			{
				coord -= Float4(0.5f);
			}

			Float4 floor = Floor(coord);
			xyz0 = Int4(floor);
			f = coord - floor;
		}

		if(function.offset)
		{
			xyz0 += offset;
		}

		if(addressingMode == ADDRESSING_SEAMLESS)  // Adjust for border.
		{
			xyz0 += Int4(1);
		}

		xyz1 = xyz0 - filter;  // Increment

		if(addressingMode == ADDRESSING_BORDER)
		{
			// Replace the coordinates with -1 if they're out of range.
			Int4 border0 = CmpLT(xyz0, Int4(0)) | CmpNLT(xyz0, dim);
			Int4 border1 = CmpLT(xyz1, Int4(0)) | CmpNLT(xyz1, dim);
			xyz0 |= border0;
			xyz1 |= border1;
		}
		else if(function.offset)
		{
			switch(addressingMode)
			{
				case ADDRESSING_SEAMLESS:
					UNREACHABLE("addressingMode %d", int(addressingMode));  // Cube sampling doesn't support offset.
				case ADDRESSING_MIRROR:
				case ADDRESSING_MIRRORONCE:
					// TODO: Implement ADDRESSING_MIRROR and ADDRESSING_MIRRORONCE.
					// Fall through to Clamp.
				case ADDRESSING_CLAMP:
					xyz0 = Min(Max(xyz0, Int4(0)), maxXYZ);
					xyz1 = Min(Max(xyz1, Int4(0)), maxXYZ);
					break;
				default:  // Wrap
					xyz0 = mod(xyz0, dim);
					xyz1 = mod(xyz1, dim);
					break;
			}
		}
		else if(state.textureFilter != FILTER_POINT)
		{
			switch(addressingMode)
			{
				case ADDRESSING_SEAMLESS:
					break;
				case ADDRESSING_MIRROR:
				case ADDRESSING_MIRRORONCE:
				case ADDRESSING_CLAMP:
					xyz0 = Max(xyz0, Int4(0));
					xyz1 = Min(xyz1, maxXYZ);
					break;
				default:  // Wrap
				{
					Int4 under = CmpLT(xyz0, Int4(0));
					xyz0 = (under & maxXYZ) | (~under & xyz0);  // xyz < 0 ? dim - 1 : xyz   // TODO: IfThenElse()

					Int4 nover = CmpLT(xyz1, dim);
					xyz1 = nover & xyz1;  // xyz >= dim ? 0 : xyz
				}
				break;
			}
		}
	}
}
2475 
computeLayerIndex(const Float4 & a,Pointer<Byte> & mipmap,SamplerFunction function)2476 Int4 SamplerCore::computeLayerIndex(const Float4 &a, Pointer<Byte> &mipmap, SamplerFunction function)
2477 {
2478 	if(!state.isArrayed())
2479 	{
2480 		return {};
2481 	}
2482 
2483 	Int4 layers = *Pointer<Int4>(mipmap + OFFSET(Mipmap, depth), 16);
2484 	Int4 maxLayer = layers - Int4(1);
2485 
2486 	if(function == Fetch)  // Unnormalized coordinates
2487 	{
2488 		Int4 xyz = As<Int4>(a);
2489 		Int4 xyz0 = Min(Max(xyz, Int4(0)), maxLayer);
2490 
2491 		// VK_EXT_image_robustness requires checking for out-of-bounds accesses.
2492 		// TODO(b/162327166): Only perform bounds checks when VK_EXT_image_robustness is enabled.
2493 		// If the above clamping altered the result, the access is out-of-bounds.
2494 		// In that case set the coordinate to -1 to perform texel replacement later.
2495 		Int4 outOfBounds = CmpNEQ(xyz, xyz0);
2496 		xyz0 |= outOfBounds;
2497 
2498 		return xyz0;
2499 	}
2500 	else
2501 	{
2502 		return Min(Max(RoundInt(a), Int4(0)), maxLayer);
2503 	}
2504 }
2505 
convertSigned15(Float4 & cf,Short4 & cs)2506 void SamplerCore::convertSigned15(Float4 &cf, Short4 &cs)
2507 {
2508 	cf = Float4(cs) * Float4(1.0f / 0x7FFF);
2509 }
2510 
convertUnsigned16(Float4 & cf,Short4 & cs)2511 void SamplerCore::convertUnsigned16(Float4 &cf, Short4 &cs)
2512 {
2513 	cf = Float4(As<UShort4>(cs)) * Float4(1.0f / 0xFFFF);
2514 }
2515 
sRGBtoLinearFF00(Short4 & c)2516 void SamplerCore::sRGBtoLinearFF00(Short4 &c)
2517 {
2518 	c = As<UShort4>(c) >> 8;
2519 
2520 	Pointer<Byte> LUT = Pointer<Byte>(constants + OFFSET(Constants, sRGBtoLinearFF_FF00));
2521 
2522 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 0))), 0);
2523 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 1))), 1);
2524 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 2))), 2);
2525 	c = Insert(c, *Pointer<Short>(LUT + 2 * Int(Extract(c, 3))), 3);
2526 }
2527 
// True if the sampled format stores floating-point texel data.
bool SamplerCore::hasFloatTexture() const
{
	return state.textureFormat.isFloatFormat();
}
2532 
// True if the format holds unnormalized (raw) integer components.
bool SamplerCore::hasUnnormalizedIntegerTexture() const
{
	return state.textureFormat.isUnnormalizedInteger();
}
2537 
// True if the given component index of the format is unsigned.
bool SamplerCore::hasUnsignedTextureComponent(int component) const
{
	return state.textureFormat.isUnsignedComponent(component);
}
2542 
// Number of components in the sampled texture format.
int SamplerCore::textureComponentCount() const
{
	return state.textureFormat.componentCount();
}
2547 
// True if the format packs all components into a single 16-bit word.
bool SamplerCore::has16bitPackedTextureFormat() const
{
	return state.textureFormat.has16bitPackedTextureFormat();
}
2552 
// True if the format stores 8-bit components.
bool SamplerCore::has8bitTextureComponents() const
{
	return state.textureFormat.has8bitTextureComponents();
}
2557 
// True if the format stores 16-bit components.
bool SamplerCore::has16bitTextureComponents() const
{
	return state.textureFormat.has16bitTextureComponents();
}
2562 
// True if the format stores 32-bit integer components.
bool SamplerCore::has32bitIntegerTextureComponents() const
{
	return state.textureFormat.has32bitIntegerTextureComponents();
}
2567 
// True if the format is a multi-planar Y'CbCr format.
bool SamplerCore::isYcbcrFormat() const
{
	return state.textureFormat.isYcbcrFormat();
}
2572 
// True if the given component index is a color (R/G/B) component, as opposed
// to alpha; used e.g. to decide which components get sRGB conversion.
bool SamplerCore::isRGBComponent(int component) const
{
	return state.textureFormat.isRGBComponent(component);
}
2577 
borderModeActive() const2578 bool SamplerCore::borderModeActive() const
2579 {
2580 	return state.addressingModeU == ADDRESSING_BORDER ||
2581 	       state.addressingModeV == ADDRESSING_BORDER ||
2582 	       state.addressingModeW == ADDRESSING_BORDER;
2583 }
2584 
gatherSwizzle() const2585 VkComponentSwizzle SamplerCore::gatherSwizzle() const
2586 {
2587 	switch(state.gatherComponent)
2588 	{
2589 		case 0: return state.swizzle.r;
2590 		case 1: return state.swizzle.g;
2591 		case 2: return state.swizzle.b;
2592 		case 3: return state.swizzle.a;
2593 		default:
2594 			UNREACHABLE("Invalid component");
2595 			return VK_COMPONENT_SWIZZLE_R;
2596 	}
2597 }
2598 
2599 }  // namespace sw
2600