1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 	#include <xmmintrin.h>
30 	#include <emmintrin.h>
31 #endif
32 
33 #undef min
34 #undef max
35 
36 namespace sw
37 {
	// Settings defined in another translation unit; only declared here.
	extern bool quadLayoutEnabled;
	extern bool complementaryDepthBuffer;
	extern TranscendentalPrecision logPrecision;

	// Color table shared by all surfaces. Buffer::read() indexes it with the
	// stored byte for FORMAT_P8 / FORMAT_A8P8 and asserts that it is non-null.
	// Starts out null.
	unsigned int *Surface::palette = 0;
	// Starts at 0. NOTE(review): presumably identifies the current palette
	// contents; its use is not visible in this part of the file.
	unsigned int Surface::paletteID = 0;
44 
write(int x,int y,int z,const Color<float> & color)45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
46 	{
47 		ASSERT((x >= -border) && (x < (width + border)));
48 		ASSERT((y >= -border) && (y < (height + border)));
49 		ASSERT((z >= 0) && (z < depth));
50 
51 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
52 
53 		for(int i = 0; i < samples; i++)
54 		{
55 			write(element, color);
56 			element += sliceB;
57 		}
58 	}
59 
write(int x,int y,const Color<float> & color)60 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
61 	{
62 		ASSERT((x >= -border) && (x < (width + border)));
63 		ASSERT((y >= -border) && (y < (height + border)));
64 
65 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
66 
67 		for(int i = 0; i < samples; i++)
68 		{
69 			write(element, color);
70 			element += sliceB;
71 		}
72 	}
73 
write(void * element,const Color<float> & color)74 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
75 	{
76 		float r = color.r;
77 		float g = color.g;
78 		float b = color.b;
79 		float a = color.a;
80 
81 		if(isSRGBformat(format))
82 		{
83 			r = linearToSRGB(r);
84 			g = linearToSRGB(g);
85 			b = linearToSRGB(b);
86 		}
87 
88 		switch(format)
89 		{
90 		case FORMAT_A8:
91 			*(unsigned char*)element = unorm<8>(a);
92 			break;
93 		case FORMAT_R8_SNORM:
94 			*(char*)element = snorm<8>(r);
95 			break;
96 		case FORMAT_R8:
97 			*(unsigned char*)element = unorm<8>(r);
98 			break;
99 		case FORMAT_R8I:
100 			*(char*)element = scast<8>(r);
101 			break;
102 		case FORMAT_R8UI:
103 			*(unsigned char*)element = ucast<8>(r);
104 			break;
105 		case FORMAT_R16I:
106 			*(short*)element = scast<16>(r);
107 			break;
108 		case FORMAT_R16UI:
109 			*(unsigned short*)element = ucast<16>(r);
110 			break;
111 		case FORMAT_R32I:
112 			*(int*)element = static_cast<int>(r);
113 			break;
114 		case FORMAT_R32UI:
115 			*(unsigned int*)element = static_cast<unsigned int>(r);
116 			break;
117 		case FORMAT_R3G3B2:
118 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
119 			break;
120 		case FORMAT_A8R3G3B2:
121 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
122 			break;
123 		case FORMAT_X4R4G4B4:
124 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
125 			break;
126 		case FORMAT_A4R4G4B4:
127 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
128 			break;
129 		case FORMAT_R4G4B4A4:
130 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
131 			break;
132 		case FORMAT_R5G6B5:
133 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
134 			break;
135 		case FORMAT_A1R5G5B5:
136 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
137 			break;
138 		case FORMAT_R5G5B5A1:
139 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<5>(a) << 0);
140 			break;
141 		case FORMAT_X1R5G5B5:
142 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
143 			break;
144 		case FORMAT_A8R8G8B8:
145 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
146 			break;
147 		case FORMAT_X8R8G8B8:
148 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
149 			break;
150 		case FORMAT_A8B8G8R8_SNORM:
151 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
152 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
153 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
154 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
155 			break;
156 		case FORMAT_A8B8G8R8:
157 		case FORMAT_SRGB8_A8:
158 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
159 			break;
160 		case FORMAT_A8B8G8R8I:
161 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
162 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
163 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
164 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
165 			break;
166 		case FORMAT_A8B8G8R8UI:
167 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
168 			break;
169 		case FORMAT_X8B8G8R8_SNORM:
170 			*(unsigned int*)element = 0x7F000000 |
171 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
172 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
173 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
174 			break;
175 		case FORMAT_X8B8G8R8:
176 		case FORMAT_SRGB8_X8:
177 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
178 			break;
179 		case FORMAT_X8B8G8R8I:
180 			*(unsigned int*)element = 0x7F000000 |
181 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
182 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
183 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
184 		case FORMAT_X8B8G8R8UI:
185 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
186 			break;
187 		case FORMAT_A2R10G10B10:
188 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
189 			break;
190 		case FORMAT_A2B10G10R10:
191 		case FORMAT_A2B10G10R10UI:
192 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
193 			break;
194 		case FORMAT_G8R8_SNORM:
195 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
196 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
197 			break;
198 		case FORMAT_G8R8:
199 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
200 			break;
201 		case FORMAT_G8R8I:
202 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
203 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
204 			break;
205 		case FORMAT_G8R8UI:
206 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
207 			break;
208 		case FORMAT_G16R16:
209 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
210 			break;
211 		case FORMAT_G16R16I:
212 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
213 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
214 			break;
215 		case FORMAT_G16R16UI:
216 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
217 			break;
218 		case FORMAT_G32R32I:
219 		case FORMAT_G32R32UI:
220 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
221 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
222 			break;
223 		case FORMAT_A16B16G16R16:
224 			((unsigned short*)element)[0] = unorm<16>(r);
225 			((unsigned short*)element)[1] = unorm<16>(g);
226 			((unsigned short*)element)[2] = unorm<16>(b);
227 			((unsigned short*)element)[3] = unorm<16>(a);
228 			break;
229 		case FORMAT_A16B16G16R16I:
230 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
231 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
232 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
233 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
234 			break;
235 		case FORMAT_A16B16G16R16UI:
236 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
237 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
238 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
239 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
240 			break;
241 		case FORMAT_X16B16G16R16I:
242 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
243 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
244 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
245 			break;
246 		case FORMAT_X16B16G16R16UI:
247 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
248 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
249 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
250 			break;
251 		case FORMAT_A32B32G32R32I:
252 		case FORMAT_A32B32G32R32UI:
253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
256 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
257 			break;
258 		case FORMAT_X32B32G32R32I:
259 		case FORMAT_X32B32G32R32UI:
260 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
261 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
262 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
263 			break;
264 		case FORMAT_V8U8:
265 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
266 			break;
267 		case FORMAT_L6V5U5:
268 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
269 			break;
270 		case FORMAT_Q8W8V8U8:
271 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
272 			break;
273 		case FORMAT_X8L8V8U8:
274 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
275 			break;
276 		case FORMAT_V16U16:
277 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
278 			break;
279 		case FORMAT_A2W10V10U10:
280 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
281 			break;
282 		case FORMAT_A16W16V16U16:
283 			((unsigned short*)element)[0] = snorm<16>(r);
284 			((unsigned short*)element)[1] = snorm<16>(g);
285 			((unsigned short*)element)[2] = snorm<16>(b);
286 			((unsigned short*)element)[3] = unorm<16>(a);
287 			break;
288 		case FORMAT_Q16W16V16U16:
289 			((unsigned short*)element)[0] = snorm<16>(r);
290 			((unsigned short*)element)[1] = snorm<16>(g);
291 			((unsigned short*)element)[2] = snorm<16>(b);
292 			((unsigned short*)element)[3] = snorm<16>(a);
293 			break;
294 		case FORMAT_R8G8B8:
295 			((unsigned char*)element)[0] = unorm<8>(b);
296 			((unsigned char*)element)[1] = unorm<8>(g);
297 			((unsigned char*)element)[2] = unorm<8>(r);
298 			break;
299 		case FORMAT_B8G8R8:
300 			((unsigned char*)element)[0] = unorm<8>(r);
301 			((unsigned char*)element)[1] = unorm<8>(g);
302 			((unsigned char*)element)[2] = unorm<8>(b);
303 			break;
304 		case FORMAT_R16F:
305 			*(half*)element = (half)r;
306 			break;
307 		case FORMAT_A16F:
308 			*(half*)element = (half)a;
309 			break;
310 		case FORMAT_G16R16F:
311 			((half*)element)[0] = (half)r;
312 			((half*)element)[1] = (half)g;
313 			break;
314 		case FORMAT_X16B16G16R16F_UNSIGNED:
315 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
316 			// Fall through to FORMAT_X16B16G16R16F.
317 		case FORMAT_X16B16G16R16F:
318 			((half*)element)[3] = 1.0f;
319 			// Fall through to FORMAT_B16G16R16F.
320 		case FORMAT_B16G16R16F:
321 			((half*)element)[0] = (half)r;
322 			((half*)element)[1] = (half)g;
323 			((half*)element)[2] = (half)b;
324 			break;
325 		case FORMAT_A16B16G16R16F:
326 			((half*)element)[0] = (half)r;
327 			((half*)element)[1] = (half)g;
328 			((half*)element)[2] = (half)b;
329 			((half*)element)[3] = (half)a;
330 			break;
331 		case FORMAT_A32F:
332 			*(float*)element = a;
333 			break;
334 		case FORMAT_R32F:
335 			*(float*)element = r;
336 			break;
337 		case FORMAT_G32R32F:
338 			((float*)element)[0] = r;
339 			((float*)element)[1] = g;
340 			break;
341 		case FORMAT_X32B32G32R32F_UNSIGNED:
342 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
343 			// Fall through to FORMAT_X32B32G32R32F.
344 		case FORMAT_X32B32G32R32F:
345 			((float*)element)[3] = 1.0f;
346 			// Fall through to FORMAT_B32G32R32F.
347 		case FORMAT_B32G32R32F:
348 			((float*)element)[0] = r;
349 			((float*)element)[1] = g;
350 			((float*)element)[2] = b;
351 			break;
352 		case FORMAT_A32B32G32R32F:
353 			((float*)element)[0] = r;
354 			((float*)element)[1] = g;
355 			((float*)element)[2] = b;
356 			((float*)element)[3] = a;
357 			break;
358 		case FORMAT_D32F:
359 		case FORMAT_D32FS8:
360 		case FORMAT_D32F_LOCKABLE:
361 		case FORMAT_D32FS8_TEXTURE:
362 		case FORMAT_D32F_SHADOW:
363 		case FORMAT_D32FS8_SHADOW:
364 			*((float*)element) = r;
365 			break;
366 		case FORMAT_D32F_COMPLEMENTARY:
367 		case FORMAT_D32FS8_COMPLEMENTARY:
368 			*((float*)element) = 1 - r;
369 			break;
370 		case FORMAT_S8:
371 			*((unsigned char*)element) = unorm<8>(r);
372 			break;
373 		case FORMAT_L8:
374 			*(unsigned char*)element = unorm<8>(r);
375 			break;
376 		case FORMAT_A4L4:
377 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
378 			break;
379 		case FORMAT_L16:
380 			*(unsigned short*)element = unorm<16>(r);
381 			break;
382 		case FORMAT_A8L8:
383 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
384 			break;
385 		case FORMAT_L16F:
386 			*(half*)element = (half)r;
387 			break;
388 		case FORMAT_A16L16F:
389 			((half*)element)[0] = (half)r;
390 			((half*)element)[1] = (half)a;
391 			break;
392 		case FORMAT_L32F:
393 			*(float*)element = r;
394 			break;
395 		case FORMAT_A32L32F:
396 			((float*)element)[0] = r;
397 			((float*)element)[1] = a;
398 			break;
399 		default:
400 			ASSERT(false);
401 		}
402 	}
403 
read(int x,int y,int z) const404 	Color<float> Surface::Buffer::read(int x, int y, int z) const
405 	{
406 		ASSERT((x >= -border) && (x < (width + border)));
407 		ASSERT((y >= -border) && (y < (height + border)));
408 		ASSERT((z >= 0) && (z < depth));
409 
410 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
411 
412 		return read(element);
413 	}
414 
read(int x,int y) const415 	Color<float> Surface::Buffer::read(int x, int y) const
416 	{
417 		ASSERT((x >= -border) && (x < (width + border)));
418 		ASSERT((y >= -border) && (y < (height + border)));
419 
420 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
421 
422 		return read(element);
423 	}
424 
read(void * element) const425 	inline Color<float> Surface::Buffer::read(void *element) const
426 	{
427 		float r = 0.0f;
428 		float g = 0.0f;
429 		float b = 0.0f;
430 		float a = 1.0f;
431 
432 		switch(format)
433 		{
434 		case FORMAT_P8:
435 			{
436 				ASSERT(palette);
437 
438 				unsigned int abgr = palette[*(unsigned char*)element];
439 
440 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
441 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
442 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
443 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
444 			}
445 			break;
446 		case FORMAT_A8P8:
447 			{
448 				ASSERT(palette);
449 
450 				unsigned int bgr = palette[((unsigned char*)element)[0]];
451 
452 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
453 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
454 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
455 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
456 			}
457 			break;
458 		case FORMAT_A8:
459 			r = 0;
460 			g = 0;
461 			b = 0;
462 			a = *(unsigned char*)element * (1.0f / 0xFF);
463 			break;
464 		case FORMAT_R8_SNORM:
465 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
466 			break;
467 		case FORMAT_R8:
468 			r = *(unsigned char*)element * (1.0f / 0xFF);
469 			break;
470 		case FORMAT_R8I:
471 			r = *(signed char*)element;
472 			break;
473 		case FORMAT_R8UI:
474 			r = *(unsigned char*)element;
475 			break;
476 		case FORMAT_R3G3B2:
477 			{
478 				unsigned char rgb = *(unsigned char*)element;
479 
480 				r = (rgb & 0xE0) * (1.0f / 0xE0);
481 				g = (rgb & 0x1C) * (1.0f / 0x1C);
482 				b = (rgb & 0x03) * (1.0f / 0x03);
483 			}
484 			break;
485 		case FORMAT_A8R3G3B2:
486 			{
487 				unsigned short argb = *(unsigned short*)element;
488 
489 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
490 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
491 				g = (argb & 0x001C) * (1.0f / 0x001C);
492 				b = (argb & 0x0003) * (1.0f / 0x0003);
493 			}
494 			break;
495 		case FORMAT_X4R4G4B4:
496 			{
497 				unsigned short rgb = *(unsigned short*)element;
498 
499 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
500 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
501 				b = (rgb & 0x000F) * (1.0f / 0x000F);
502 			}
503 			break;
504 		case FORMAT_A4R4G4B4:
505 			{
506 				unsigned short argb = *(unsigned short*)element;
507 
508 				a = (argb & 0xF000) * (1.0f / 0xF000);
509 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
510 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
511 				b = (argb & 0x000F) * (1.0f / 0x000F);
512 			}
513 			break;
514 		case FORMAT_R4G4B4A4:
515 			{
516 				unsigned short rgba = *(unsigned short*)element;
517 
518 				r = (rgba & 0xF000) * (1.0f / 0xF000);
519 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
520 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
521 				a = (rgba & 0x000F) * (1.0f / 0x000F);
522 			}
523 			break;
524 		case FORMAT_R5G6B5:
525 			{
526 				unsigned short rgb = *(unsigned short*)element;
527 
528 				r = (rgb & 0xF800) * (1.0f / 0xF800);
529 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
530 				b = (rgb & 0x001F) * (1.0f / 0x001F);
531 			}
532 			break;
533 		case FORMAT_A1R5G5B5:
534 			{
535 				unsigned short argb = *(unsigned short*)element;
536 
537 				a = (argb & 0x8000) * (1.0f / 0x8000);
538 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
539 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
540 				b = (argb & 0x001F) * (1.0f / 0x001F);
541 			}
542 			break;
543 		case FORMAT_R5G5B5A1:
544 			{
545 				unsigned short rgba = *(unsigned short*)element;
546 
547 				r = (rgba & 0xF800) * (1.0f / 0xF800);
548 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
549 				b = (rgba & 0x003E) * (1.0f / 0x003E);
550 				a = (rgba & 0x0001) * (1.0f / 0x0001);
551 			}
552 			break;
553 		case FORMAT_X1R5G5B5:
554 			{
555 				unsigned short xrgb = *(unsigned short*)element;
556 
557 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
558 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
559 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
560 			}
561 			break;
562 		case FORMAT_A8R8G8B8:
563 			{
564 				unsigned int argb = *(unsigned int*)element;
565 
566 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
567 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
568 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
569 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
570 			}
571 			break;
572 		case FORMAT_X8R8G8B8:
573 			{
574 				unsigned int xrgb = *(unsigned int*)element;
575 
576 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
577 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
578 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
579 			}
580 			break;
581 		case FORMAT_A8B8G8R8_SNORM:
582 			{
583 				signed char* abgr = (signed char*)element;
584 
585 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
586 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
587 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
588 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
589 			}
590 			break;
591 		case FORMAT_A8B8G8R8:
592 		case FORMAT_SRGB8_A8:
593 			{
594 				unsigned int abgr = *(unsigned int*)element;
595 
596 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
597 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
598 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
599 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
600 			}
601 			break;
602 		case FORMAT_A8B8G8R8I:
603 			{
604 				signed char* abgr = (signed char*)element;
605 
606 				r = abgr[0];
607 				g = abgr[1];
608 				b = abgr[2];
609 				a = abgr[3];
610 			}
611 			break;
612 		case FORMAT_A8B8G8R8UI:
613 			{
614 				unsigned char* abgr = (unsigned char*)element;
615 
616 				r = abgr[0];
617 				g = abgr[1];
618 				b = abgr[2];
619 				a = abgr[3];
620 			}
621 			break;
622 		case FORMAT_X8B8G8R8_SNORM:
623 			{
624 				signed char* bgr = (signed char*)element;
625 
626 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
627 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
628 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
629 			}
630 			break;
631 		case FORMAT_X8B8G8R8:
632 		case FORMAT_SRGB8_X8:
633 			{
634 				unsigned int xbgr = *(unsigned int*)element;
635 
636 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
637 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
638 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
639 			}
640 			break;
641 		case FORMAT_X8B8G8R8I:
642 			{
643 				signed char* bgr = (signed char*)element;
644 
645 				r = bgr[0];
646 				g = bgr[1];
647 				b = bgr[2];
648 			}
649 			break;
650 		case FORMAT_X8B8G8R8UI:
651 			{
652 				unsigned char* bgr = (unsigned char*)element;
653 
654 				r = bgr[0];
655 				g = bgr[1];
656 				b = bgr[2];
657 			}
658 			break;
659 		case FORMAT_G8R8_SNORM:
660 			{
661 				signed char* gr = (signed char*)element;
662 
663 				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
664 				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
665 			}
666 			break;
667 		case FORMAT_G8R8:
668 			{
669 				unsigned short gr = *(unsigned short*)element;
670 
671 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
672 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
673 			}
674 			break;
675 		case FORMAT_G8R8I:
676 			{
677 				signed char* gr = (signed char*)element;
678 
679 				r = gr[0];
680 				g = gr[1];
681 			}
682 			break;
683 		case FORMAT_G8R8UI:
684 			{
685 				unsigned char* gr = (unsigned char*)element;
686 
687 				r = gr[0];
688 				g = gr[1];
689 			}
690 			break;
691 		case FORMAT_R16I:
692 			r = *((short*)element);
693 			break;
694 		case FORMAT_R16UI:
695 			r = *((unsigned short*)element);
696 			break;
697 		case FORMAT_G16R16I:
698 			{
699 				short* gr = (short*)element;
700 
701 				r = gr[0];
702 				g = gr[1];
703 			}
704 			break;
705 		case FORMAT_G16R16:
706 			{
707 				unsigned int gr = *(unsigned int*)element;
708 
709 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
710 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
711 			}
712 			break;
713 		case FORMAT_G16R16UI:
714 			{
715 				unsigned short* gr = (unsigned short*)element;
716 
717 				r = gr[0];
718 				g = gr[1];
719 			}
720 			break;
721 		case FORMAT_A2R10G10B10:
722 			{
723 				unsigned int argb = *(unsigned int*)element;
724 
725 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
726 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
727 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
728 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
729 			}
730 			break;
731 		case FORMAT_A2B10G10R10:
732 			{
733 				unsigned int abgr = *(unsigned int*)element;
734 
735 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
736 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
737 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
738 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
739 			}
740 			break;
741 		case FORMAT_A2B10G10R10UI:
742 			{
743 				unsigned int abgr = *(unsigned int*)element;
744 
745 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
746 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
747 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
748 				r = static_cast<float>(abgr & 0x000003FF);
749 			}
750 			break;
751 		case FORMAT_A16B16G16R16I:
752 			{
753 				short* abgr = (short*)element;
754 
755 				r = abgr[0];
756 				g = abgr[1];
757 				b = abgr[2];
758 				a = abgr[3];
759 			}
760 			break;
761 		case FORMAT_A16B16G16R16:
762 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
763 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
764 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
765 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
766 			break;
767 		case FORMAT_A16B16G16R16UI:
768 			{
769 				unsigned short* abgr = (unsigned short*)element;
770 
771 				r = abgr[0];
772 				g = abgr[1];
773 				b = abgr[2];
774 				a = abgr[3];
775 			}
776 			break;
777 		case FORMAT_X16B16G16R16I:
778 			{
779 				short* bgr = (short*)element;
780 
781 				r = bgr[0];
782 				g = bgr[1];
783 				b = bgr[2];
784 			}
785 			break;
786 		case FORMAT_X16B16G16R16UI:
787 			{
788 				unsigned short* bgr = (unsigned short*)element;
789 
790 				r = bgr[0];
791 				g = bgr[1];
792 				b = bgr[2];
793 			}
794 			break;
795 		case FORMAT_A32B32G32R32I:
796 			{
797 				int* abgr = (int*)element;
798 
799 				r = static_cast<float>(abgr[0]);
800 				g = static_cast<float>(abgr[1]);
801 				b = static_cast<float>(abgr[2]);
802 				a = static_cast<float>(abgr[3]);
803 			}
804 			break;
805 		case FORMAT_A32B32G32R32UI:
806 			{
807 				unsigned int* abgr = (unsigned int*)element;
808 
809 				r = static_cast<float>(abgr[0]);
810 				g = static_cast<float>(abgr[1]);
811 				b = static_cast<float>(abgr[2]);
812 				a = static_cast<float>(abgr[3]);
813 			}
814 			break;
815 		case FORMAT_X32B32G32R32I:
816 			{
817 				int* bgr = (int*)element;
818 
819 				r = static_cast<float>(bgr[0]);
820 				g = static_cast<float>(bgr[1]);
821 				b = static_cast<float>(bgr[2]);
822 			}
823 			break;
824 		case FORMAT_X32B32G32R32UI:
825 			{
826 				unsigned int* bgr = (unsigned int*)element;
827 
828 				r = static_cast<float>(bgr[0]);
829 				g = static_cast<float>(bgr[1]);
830 				b = static_cast<float>(bgr[2]);
831 			}
832 			break;
833 		case FORMAT_G32R32I:
834 			{
835 				int* gr = (int*)element;
836 
837 				r = static_cast<float>(gr[0]);
838 				g = static_cast<float>(gr[1]);
839 			}
840 			break;
841 		case FORMAT_G32R32UI:
842 			{
843 				unsigned int* gr = (unsigned int*)element;
844 
845 				r = static_cast<float>(gr[0]);
846 				g = static_cast<float>(gr[1]);
847 			}
848 			break;
849 		case FORMAT_R32I:
850 			r = static_cast<float>(*((int*)element));
851 			break;
852 		case FORMAT_R32UI:
853 			r = static_cast<float>(*((unsigned int*)element));
854 			break;
855 		case FORMAT_V8U8:
856 			{
857 				unsigned short vu = *(unsigned short*)element;
858 
859 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
860 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
861 			}
862 			break;
863 		case FORMAT_L6V5U5:
864 			{
865 				unsigned short lvu = *(unsigned short*)element;
866 
867 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
868 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
869 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
870 			}
871 			break;
872 		case FORMAT_Q8W8V8U8:
873 			{
874 				unsigned int qwvu = *(unsigned int*)element;
875 
876 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
877 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
878 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
879 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
880 			}
881 			break;
882 		case FORMAT_X8L8V8U8:
883 			{
884 				unsigned int xlvu = *(unsigned int*)element;
885 
886 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
887 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
888 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
889 			}
890 			break;
891 		case FORMAT_R8G8B8:
892 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
893 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
894 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
895 			break;
896 		case FORMAT_B8G8R8:
897 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
898 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
899 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
900 			break;
901 		case FORMAT_V16U16:
902 			{
903 				unsigned int vu = *(unsigned int*)element;
904 
905 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
906 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
907 			}
908 			break;
909 		case FORMAT_A2W10V10U10:
910 			{
911 				unsigned int awvu = *(unsigned int*)element;
912 
913 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
914 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
915 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
916 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
917 			}
918 			break;
919 		case FORMAT_A16W16V16U16:
920 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
921 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
922 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
923 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
924 			break;
925 		case FORMAT_Q16W16V16U16:
926 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
927 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
928 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
929 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
930 			break;
931 		case FORMAT_L8:
932 			r =
933 			g =
934 			b = *(unsigned char*)element * (1.0f / 0xFF);
935 			break;
936 		case FORMAT_A4L4:
937 			{
938 				unsigned char al = *(unsigned char*)element;
939 
940 				r =
941 				g =
942 				b = (al & 0x0F) * (1.0f / 0x0F);
943 				a = (al & 0xF0) * (1.0f / 0xF0);
944 			}
945 			break;
946 		case FORMAT_L16:
947 			r =
948 			g =
949 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
950 			break;
951 		case FORMAT_A8L8:
952 			r =
953 			g =
954 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
955 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
956 			break;
957 		case FORMAT_L16F:
958 			r =
959 			g =
960 			b = *(half*)element;
961 			break;
962 		case FORMAT_A16L16F:
963 			r =
964 			g =
965 			b = ((half*)element)[0];
966 			a = ((half*)element)[1];
967 			break;
968 		case FORMAT_L32F:
969 			r =
970 			g =
971 			b = *(float*)element;
972 			break;
973 		case FORMAT_A32L32F:
974 			r =
975 			g =
976 			b = ((float*)element)[0];
977 			a = ((float*)element)[1];
978 			break;
979 		case FORMAT_A16F:
980 			a = *(half*)element;
981 			break;
982 		case FORMAT_R16F:
983 			r = *(half*)element;
984 			break;
985 		case FORMAT_G16R16F:
986 			r = ((half*)element)[0];
987 			g = ((half*)element)[1];
988 			break;
989 		case FORMAT_X16B16G16R16F:
990 		case FORMAT_X16B16G16R16F_UNSIGNED:
991 		case FORMAT_B16G16R16F:
992 			r = ((half*)element)[0];
993 			g = ((half*)element)[1];
994 			b = ((half*)element)[2];
995 			break;
996 		case FORMAT_A16B16G16R16F:
997 			r = ((half*)element)[0];
998 			g = ((half*)element)[1];
999 			b = ((half*)element)[2];
1000 			a = ((half*)element)[3];
1001 			break;
1002 		case FORMAT_A32F:
1003 			a = *(float*)element;
1004 			break;
1005 		case FORMAT_R32F:
1006 			r = *(float*)element;
1007 			break;
1008 		case FORMAT_G32R32F:
1009 			r = ((float*)element)[0];
1010 			g = ((float*)element)[1];
1011 			break;
1012 		case FORMAT_X32B32G32R32F:
1013 		case FORMAT_X32B32G32R32F_UNSIGNED:
1014 		case FORMAT_B32G32R32F:
1015 			r = ((float*)element)[0];
1016 			g = ((float*)element)[1];
1017 			b = ((float*)element)[2];
1018 			break;
1019 		case FORMAT_A32B32G32R32F:
1020 			r = ((float*)element)[0];
1021 			g = ((float*)element)[1];
1022 			b = ((float*)element)[2];
1023 			a = ((float*)element)[3];
1024 			break;
1025 		case FORMAT_D32F:
1026 		case FORMAT_D32FS8:
1027 		case FORMAT_D32F_LOCKABLE:
1028 		case FORMAT_D32FS8_TEXTURE:
1029 		case FORMAT_D32F_SHADOW:
1030 		case FORMAT_D32FS8_SHADOW:
1031 			r = *(float*)element;
1032 			g = r;
1033 			b = r;
1034 			a = r;
1035 			break;
1036 		case FORMAT_D32F_COMPLEMENTARY:
1037 		case FORMAT_D32FS8_COMPLEMENTARY:
1038 			r = 1.0f - *(float*)element;
1039 			g = r;
1040 			b = r;
1041 			a = r;
1042 			break;
1043 		case FORMAT_S8:
1044 			r = *(unsigned char*)element * (1.0f / 0xFF);
1045 			break;
1046 		default:
1047 			ASSERT(false);
1048 		}
1049 
1050 		if(isSRGBformat(format))
1051 		{
1052 			r = sRGBtoLinear(r);
1053 			g = sRGBtoLinear(g);
1054 			b = sRGBtoLinear(b);
1055 		}
1056 
1057 		return Color<float>(r, g, b, a);
1058 	}
1059 
sample(float x,float y,float z) const1060 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1061 	{
1062 		x -= 0.5f;
1063 		y -= 0.5f;
1064 		z -= 0.5f;
1065 
1066 		int x0 = clamp((int)x, 0, width - 1);
1067 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1068 
1069 		int y0 = clamp((int)y, 0, height - 1);
1070 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1071 
1072 		int z0 = clamp((int)z, 0, depth - 1);
1073 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1074 
1075 		Color<float> c000 = read(x0, y0, z0);
1076 		Color<float> c100 = read(x1, y0, z0);
1077 		Color<float> c010 = read(x0, y1, z0);
1078 		Color<float> c110 = read(x1, y1, z0);
1079 		Color<float> c001 = read(x0, y0, z1);
1080 		Color<float> c101 = read(x1, y0, z1);
1081 		Color<float> c011 = read(x0, y1, z1);
1082 		Color<float> c111 = read(x1, y1, z1);
1083 
1084 		float fx = x - x0;
1085 		float fy = y - y0;
1086 		float fz = z - z0;
1087 
1088 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1089 		c100 *= fx * (1 - fy) * (1 - fz);
1090 		c010 *= (1 - fx) * fy * (1 - fz);
1091 		c110 *= fx * fy * (1 - fz);
1092 		c001 *= (1 - fx) * (1 - fy) * fz;
1093 		c101 *= fx * (1 - fy) * fz;
1094 		c011 *= (1 - fx) * fy * fz;
1095 		c111 *= fx * fy * fz;
1096 
1097 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1098 	}
1099 
sample(float x,float y,int layer) const1100 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
1101 	{
1102 		x -= 0.5f;
1103 		y -= 0.5f;
1104 
1105 		int x0 = clamp((int)x, 0, width - 1);
1106 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1107 
1108 		int y0 = clamp((int)y, 0, height - 1);
1109 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1110 
1111 		Color<float> c00 = read(x0, y0, layer);
1112 		Color<float> c10 = read(x1, y0, layer);
1113 		Color<float> c01 = read(x0, y1, layer);
1114 		Color<float> c11 = read(x1, y1, layer);
1115 
1116 		float fx = x - x0;
1117 		float fy = y - y0;
1118 
1119 		c00 *= (1 - fx) * (1 - fy);
1120 		c10 *= fx * (1 - fy);
1121 		c01 *= (1 - fx) * fy;
1122 		c11 *= fx * fy;
1123 
1124 		return c00 + c10 + c01 + c11;
1125 	}
1126 
lockRect(int x,int y,int z,Lock lock)1127 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1128 	{
1129 		this->lock = lock;
1130 
1131 		switch(lock)
1132 		{
1133 		case LOCK_UNLOCKED:
1134 		case LOCK_READONLY:
1135 		case LOCK_UPDATE:
1136 			break;
1137 		case LOCK_WRITEONLY:
1138 		case LOCK_READWRITE:
1139 		case LOCK_DISCARD:
1140 			dirty = true;
1141 			break;
1142 		default:
1143 			ASSERT(false);
1144 		}
1145 
1146 		if(buffer)
1147 		{
1148 			x += border;
1149 			y += border;
1150 
1151 			switch(format)
1152 			{
1153 			case FORMAT_DXT1:
1154 			case FORMAT_ATI1:
1155 			case FORMAT_ETC1:
1156 			case FORMAT_R11_EAC:
1157 			case FORMAT_SIGNED_R11_EAC:
1158 			case FORMAT_RGB8_ETC2:
1159 			case FORMAT_SRGB8_ETC2:
1160 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1161 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1162 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1163 			case FORMAT_RG11_EAC:
1164 			case FORMAT_SIGNED_RG11_EAC:
1165 			case FORMAT_RGBA8_ETC2_EAC:
1166 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1167 			case FORMAT_RGBA_ASTC_4x4_KHR:
1168 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1169 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1170 			case FORMAT_RGBA_ASTC_5x4_KHR:
1171 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1172 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1173 			case FORMAT_RGBA_ASTC_5x5_KHR:
1174 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1175 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1176 			case FORMAT_RGBA_ASTC_6x5_KHR:
1177 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1178 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1179 			case FORMAT_RGBA_ASTC_6x6_KHR:
1180 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1181 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1182 			case FORMAT_RGBA_ASTC_8x5_KHR:
1183 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1184 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1185 			case FORMAT_RGBA_ASTC_8x6_KHR:
1186 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1187 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1188 			case FORMAT_RGBA_ASTC_8x8_KHR:
1189 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1190 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1191 			case FORMAT_RGBA_ASTC_10x5_KHR:
1192 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1193 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1194 			case FORMAT_RGBA_ASTC_10x6_KHR:
1195 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1196 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1197 			case FORMAT_RGBA_ASTC_10x8_KHR:
1198 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1199 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1200 			case FORMAT_RGBA_ASTC_10x10_KHR:
1201 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1202 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1203 			case FORMAT_RGBA_ASTC_12x10_KHR:
1204 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1205 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1206 			case FORMAT_RGBA_ASTC_12x12_KHR:
1207 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1208 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1209 			case FORMAT_DXT3:
1210 			case FORMAT_DXT5:
1211 			case FORMAT_ATI2:
1212 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1213 			default:
1214 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
1215 			}
1216 		}
1217 
1218 		return nullptr;
1219 	}
1220 
unlockRect()1221 	void Surface::Buffer::unlockRect()
1222 	{
1223 		lock = LOCK_UNLOCKED;
1224 	}
1225 
1226 	class SurfaceImplementation : public Surface
1227 	{
1228 	public:
SurfaceImplementation(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1229 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1230 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
SurfaceImplementation(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchP=0)1231 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1232 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
~SurfaceImplementation()1233 		~SurfaceImplementation() override {};
1234 
lockInternal(int x,int y,int z,Lock lock,Accessor client)1235 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1236 		{
1237 			return Surface::lockInternal(x, y, z, lock, client);
1238 		}
1239 
unlockInternal()1240 		void unlockInternal() override
1241 		{
1242 			Surface::unlockInternal();
1243 		}
1244 	};
1245 
create(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1246 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1247 	{
1248 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1249 	}
1250 
create(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchPprovided)1251 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1252 	{
1253 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
1254 	}
1255 
Surface(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1256 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1257 	{
1258 		resource = new Resource(0);
1259 		hasParent = false;
1260 		ownExternal = false;
1261 		depth = max(1, depth);
1262 
1263 		external.buffer = pixels;
1264 		external.width = width;
1265 		external.height = height;
1266 		external.depth = depth;
1267 		external.samples = 1;
1268 		external.format = format;
1269 		external.bytes = bytes(external.format);
1270 		external.pitchB = pitch;
1271 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1272 		external.sliceB = slice;
1273 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1274 		external.border = 0;
1275 		external.lock = LOCK_UNLOCKED;
1276 		external.dirty = true;
1277 
1278 		internal.buffer = nullptr;
1279 		internal.width = width;
1280 		internal.height = height;
1281 		internal.depth = depth;
1282 		internal.samples = 1;
1283 		internal.format = selectInternalFormat(format);
1284 		internal.bytes = bytes(internal.format);
1285 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
1286 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
1287 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
1288 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
1289 		internal.border = 0;
1290 		internal.lock = LOCK_UNLOCKED;
1291 		internal.dirty = false;
1292 
1293 		stencil.buffer = nullptr;
1294 		stencil.width = width;
1295 		stencil.height = height;
1296 		stencil.depth = depth;
1297 		stencil.samples = 1;
1298 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1299 		stencil.bytes = bytes(stencil.format);
1300 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
1301 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
1302 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
1303 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
1304 		stencil.border = 0;
1305 		stencil.lock = LOCK_UNLOCKED;
1306 		stencil.dirty = false;
1307 
1308 		dirtyContents = true;
1309 		paletteUsed = 0;
1310 	}
1311 
Surface(Resource * texture,int width,int height,int depth,int border,int samples,Format format,bool lockable,bool renderTarget,int pitchPprovided)1312 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1313 	{
1314 		resource = texture ? texture : new Resource(0);
1315 		hasParent = texture != nullptr;
1316 		ownExternal = true;
1317 		depth = max(1, depth);
1318 		samples = max(1, samples);
1319 
1320 		external.buffer = nullptr;
1321 		external.width = width;
1322 		external.height = height;
1323 		external.depth = depth;
1324 		external.samples = (short)samples;
1325 		external.format = format;
1326 		external.bytes = bytes(external.format);
1327 		external.pitchB = !pitchPprovided ? pitchB(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided * external.bytes;
1328 		external.pitchP = !pitchPprovided ? pitchP(external.width, 0, external.format, renderTarget && !texture) : pitchPprovided;
1329 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
1330 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
1331 		external.border = 0;
1332 		external.lock = LOCK_UNLOCKED;
1333 		external.dirty = false;
1334 
1335 		internal.buffer = nullptr;
1336 		internal.width = width;
1337 		internal.height = height;
1338 		internal.depth = depth;
1339 		internal.samples = (short)samples;
1340 		internal.format = selectInternalFormat(format);
1341 		internal.bytes = bytes(internal.format);
1342 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1343 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
1344 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
1345 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
1346 		internal.border = (short)border;
1347 		internal.lock = LOCK_UNLOCKED;
1348 		internal.dirty = false;
1349 
1350 		stencil.buffer = nullptr;
1351 		stencil.width = width;
1352 		stencil.height = height;
1353 		stencil.depth = depth;
1354 		stencil.samples = (short)samples;
1355 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1356 		stencil.bytes = bytes(stencil.format);
1357 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
1358 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
1359 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1360 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1361 		stencil.border = 0;
1362 		stencil.lock = LOCK_UNLOCKED;
1363 		stencil.dirty = false;
1364 
1365 		dirtyContents = true;
1366 		paletteUsed = 0;
1367 	}
1368 
~Surface()1369 	Surface::~Surface()
1370 	{
1371 		// sync() must be called before this destructor to ensure all locks have been released.
1372 		// We can't call it here because the parent resource may already have been destroyed.
1373 		ASSERT(isUnlocked());
1374 
1375 		if(!hasParent)
1376 		{
1377 			resource->destruct();
1378 		}
1379 
1380 		if(ownExternal)
1381 		{
1382 			deallocate(external.buffer);
1383 		}
1384 
1385 		if(internal.buffer != external.buffer)
1386 		{
1387 			deallocate(internal.buffer);
1388 		}
1389 
1390 		deallocate(stencil.buffer);
1391 
1392 		external.buffer = nullptr;
1393 		internal.buffer = nullptr;
1394 		stencil.buffer = nullptr;
1395 	}
1396 
lockExternal(int x,int y,int z,Lock lock,Accessor client)1397 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1398 	{
1399 		resource->lock(client);
1400 
1401 		if(!external.buffer)
1402 		{
1403 			if(internal.buffer && identicalBuffers())
1404 			{
1405 				external.buffer = internal.buffer;
1406 			}
1407 			else
1408 			{
1409 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
1410 			}
1411 		}
1412 
1413 		if(internal.dirty)
1414 		{
1415 			if(lock != LOCK_DISCARD)
1416 			{
1417 				update(external, internal);
1418 			}
1419 
1420 			internal.dirty = false;
1421 		}
1422 
1423 		switch(lock)
1424 		{
1425 		case LOCK_READONLY:
1426 			break;
1427 		case LOCK_WRITEONLY:
1428 		case LOCK_READWRITE:
1429 		case LOCK_DISCARD:
1430 			dirtyContents = true;
1431 			break;
1432 		default:
1433 			ASSERT(false);
1434 		}
1435 
1436 		return external.lockRect(x, y, z, lock);
1437 	}
1438 
unlockExternal()1439 	void Surface::unlockExternal()
1440 	{
1441 		external.unlockRect();
1442 
1443 		resource->unlock();
1444 	}
1445 
lockInternal(int x,int y,int z,Lock lock,Accessor client)1446 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1447 	{
1448 		if(lock != LOCK_UNLOCKED)
1449 		{
1450 			resource->lock(client);
1451 		}
1452 
1453 		if(!internal.buffer)
1454 		{
1455 			if(external.buffer && identicalBuffers())
1456 			{
1457 				internal.buffer = external.buffer;
1458 			}
1459 			else
1460 			{
1461 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
1462 			}
1463 		}
1464 
1465 		// FIXME: WHQL requires conversion to lower external precision and back
1466 		if(logPrecision >= WHQL)
1467 		{
1468 			if(internal.dirty && renderTarget && internal.format != external.format)
1469 			{
1470 				if(lock != LOCK_DISCARD)
1471 				{
1472 					switch(external.format)
1473 					{
1474 					case FORMAT_R3G3B2:
1475 					case FORMAT_A8R3G3B2:
1476 					case FORMAT_A1R5G5B5:
1477 					case FORMAT_A2R10G10B10:
1478 					case FORMAT_A2B10G10R10:
1479 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1480 						unlockExternal();
1481 						break;
1482 					default:
1483 						// Difference passes WHQL
1484 						break;
1485 					}
1486 				}
1487 			}
1488 		}
1489 
1490 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1491 		{
1492 			if(lock != LOCK_DISCARD)
1493 			{
1494 				update(internal, external);
1495 			}
1496 
1497 			external.dirty = false;
1498 			paletteUsed = Surface::paletteID;
1499 		}
1500 
1501 		switch(lock)
1502 		{
1503 		case LOCK_UNLOCKED:
1504 		case LOCK_READONLY:
1505 			break;
1506 		case LOCK_WRITEONLY:
1507 		case LOCK_READWRITE:
1508 		case LOCK_DISCARD:
1509 			dirtyContents = true;
1510 			break;
1511 		default:
1512 			ASSERT(false);
1513 		}
1514 
1515 		if(lock == LOCK_READONLY && client == PUBLIC)
1516 		{
1517 			resolve();
1518 		}
1519 
1520 		return internal.lockRect(x, y, z, lock);
1521 	}
1522 
unlockInternal()1523 	void Surface::unlockInternal()
1524 	{
1525 		internal.unlockRect();
1526 
1527 		resource->unlock();
1528 	}
1529 
lockStencil(int x,int y,int front,Accessor client)1530 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1531 	{
1532 		resource->lock(client);
1533 
1534 		if(stencil.format == FORMAT_NULL)
1535 		{
1536 			return nullptr;
1537 		}
1538 
1539 		if(!stencil.buffer)
1540 		{
1541 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
1542 		}
1543 
1544 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1545 	}
1546 
unlockStencil()1547 	void Surface::unlockStencil()
1548 	{
1549 		stencil.unlockRect();
1550 
1551 		resource->unlock();
1552 	}
1553 
bytes(Format format)1554 	int Surface::bytes(Format format)
1555 	{
1556 		switch(format)
1557 		{
1558 		case FORMAT_NULL:				return 0;
1559 		case FORMAT_P8:					return 1;
1560 		case FORMAT_A8P8:				return 2;
1561 		case FORMAT_A8:					return 1;
1562 		case FORMAT_R8I:				return 1;
1563 		case FORMAT_R8:					return 1;
1564 		case FORMAT_R3G3B2:				return 1;
1565 		case FORMAT_R16I:				return 2;
1566 		case FORMAT_R16UI:				return 2;
1567 		case FORMAT_A8R3G3B2:			return 2;
1568 		case FORMAT_R5G6B5:				return 2;
1569 		case FORMAT_A1R5G5B5:			return 2;
1570 		case FORMAT_X1R5G5B5:			return 2;
1571 		case FORMAT_R5G5B5A1:           return 2;
1572 		case FORMAT_X4R4G4B4:			return 2;
1573 		case FORMAT_A4R4G4B4:			return 2;
1574 		case FORMAT_R4G4B4A4:           return 2;
1575 		case FORMAT_R8G8B8:				return 3;
1576 		case FORMAT_B8G8R8:             return 3;
1577 		case FORMAT_R32I:				return 4;
1578 		case FORMAT_R32UI:				return 4;
1579 		case FORMAT_X8R8G8B8:			return 4;
1580 	//	case FORMAT_X8G8R8B8Q:			return 4;
1581 		case FORMAT_A8R8G8B8:			return 4;
1582 	//	case FORMAT_A8G8R8B8Q:			return 4;
1583 		case FORMAT_X8B8G8R8I:			return 4;
1584 		case FORMAT_X8B8G8R8:			return 4;
1585 		case FORMAT_SRGB8_X8:			return 4;
1586 		case FORMAT_SRGB8_A8:			return 4;
1587 		case FORMAT_A8B8G8R8I:			return 4;
1588 		case FORMAT_R8UI:				return 1;
1589 		case FORMAT_G8R8UI:				return 2;
1590 		case FORMAT_X8B8G8R8UI:			return 4;
1591 		case FORMAT_A8B8G8R8UI:			return 4;
1592 		case FORMAT_A8B8G8R8:			return 4;
1593 		case FORMAT_R8_SNORM:			return 1;
1594 		case FORMAT_G8R8_SNORM:		return 2;
1595 		case FORMAT_X8B8G8R8_SNORM:	return 4;
1596 		case FORMAT_A8B8G8R8_SNORM:	return 4;
1597 		case FORMAT_A2R10G10B10:		return 4;
1598 		case FORMAT_A2B10G10R10:		return 4;
1599 		case FORMAT_A2B10G10R10UI:		return 4;
1600 		case FORMAT_G8R8I:				return 2;
1601 		case FORMAT_G8R8:				return 2;
1602 		case FORMAT_G16R16I:			return 4;
1603 		case FORMAT_G16R16UI:			return 4;
1604 		case FORMAT_G16R16:				return 4;
1605 		case FORMAT_G32R32I:			return 8;
1606 		case FORMAT_G32R32UI:			return 8;
1607 		case FORMAT_X16B16G16R16I:		return 8;
1608 		case FORMAT_X16B16G16R16UI:		return 8;
1609 		case FORMAT_A16B16G16R16I:		return 8;
1610 		case FORMAT_A16B16G16R16UI:		return 8;
1611 		case FORMAT_A16B16G16R16:		return 8;
1612 		case FORMAT_X32B32G32R32I:		return 16;
1613 		case FORMAT_X32B32G32R32UI:		return 16;
1614 		case FORMAT_A32B32G32R32I:		return 16;
1615 		case FORMAT_A32B32G32R32UI:		return 16;
1616 		// Compressed formats
1617 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1618 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1619 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1620 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1621 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1622 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1623 		case FORMAT_R11_EAC:			return 2;
1624 		case FORMAT_SIGNED_R11_EAC:		return 2;
1625 		case FORMAT_RG11_EAC:			return 4;
1626 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1627 		case FORMAT_RGB8_ETC2:			return 2;
1628 		case FORMAT_SRGB8_ETC2:			return 2;
1629 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1630 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1631 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1632 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1633 		case FORMAT_RGBA_ASTC_4x4_KHR:
1634 		case FORMAT_RGBA_ASTC_5x4_KHR:
1635 		case FORMAT_RGBA_ASTC_5x5_KHR:
1636 		case FORMAT_RGBA_ASTC_6x5_KHR:
1637 		case FORMAT_RGBA_ASTC_6x6_KHR:
1638 		case FORMAT_RGBA_ASTC_8x5_KHR:
1639 		case FORMAT_RGBA_ASTC_8x6_KHR:
1640 		case FORMAT_RGBA_ASTC_8x8_KHR:
1641 		case FORMAT_RGBA_ASTC_10x5_KHR:
1642 		case FORMAT_RGBA_ASTC_10x6_KHR:
1643 		case FORMAT_RGBA_ASTC_10x8_KHR:
1644 		case FORMAT_RGBA_ASTC_10x10_KHR:
1645 		case FORMAT_RGBA_ASTC_12x10_KHR:
1646 		case FORMAT_RGBA_ASTC_12x12_KHR:
1647 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1648 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1649 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1650 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1651 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1652 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1653 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1654 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1655 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1656 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1657 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1658 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1659 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1660 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1661 		// Bumpmap formats
1662 		case FORMAT_V8U8:				return 2;
1663 		case FORMAT_L6V5U5:				return 2;
1664 		case FORMAT_Q8W8V8U8:			return 4;
1665 		case FORMAT_X8L8V8U8:			return 4;
1666 		case FORMAT_A2W10V10U10:		return 4;
1667 		case FORMAT_V16U16:				return 4;
1668 		case FORMAT_A16W16V16U16:		return 8;
1669 		case FORMAT_Q16W16V16U16:		return 8;
1670 		// Luminance formats
1671 		case FORMAT_L8:					return 1;
1672 		case FORMAT_A4L4:				return 1;
1673 		case FORMAT_L16:				return 2;
1674 		case FORMAT_A8L8:				return 2;
1675 		case FORMAT_L16F:               return 2;
1676 		case FORMAT_A16L16F:            return 4;
1677 		case FORMAT_L32F:               return 4;
1678 		case FORMAT_A32L32F:            return 8;
1679 		// Floating-point formats
1680 		case FORMAT_A16F:				return 2;
1681 		case FORMAT_R16F:				return 2;
1682 		case FORMAT_G16R16F:			return 4;
1683 		case FORMAT_B16G16R16F:			return 6;
1684 		case FORMAT_X16B16G16R16F:		return 8;
1685 		case FORMAT_A16B16G16R16F:		return 8;
1686 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
1687 		case FORMAT_A32F:				return 4;
1688 		case FORMAT_R32F:				return 4;
1689 		case FORMAT_G32R32F:			return 8;
1690 		case FORMAT_B32G32R32F:			return 12;
1691 		case FORMAT_X32B32G32R32F:		return 16;
1692 		case FORMAT_A32B32G32R32F:		return 16;
1693 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
1694 		// Depth/stencil formats
1695 		case FORMAT_D16:				return 2;
1696 		case FORMAT_D32:				return 4;
1697 		case FORMAT_D24X8:				return 4;
1698 		case FORMAT_D24S8:				return 4;
1699 		case FORMAT_D24FS8:				return 4;
1700 		case FORMAT_D32F:				return 4;
1701 		case FORMAT_D32FS8:				return 4;
1702 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1703 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
1704 		case FORMAT_D32F_LOCKABLE:		return 4;
1705 		case FORMAT_D32FS8_TEXTURE:		return 4;
1706 		case FORMAT_D32F_SHADOW:		return 4;
1707 		case FORMAT_D32FS8_SHADOW:		return 4;
1708 		case FORMAT_DF24S8:				return 4;
1709 		case FORMAT_DF16S8:				return 2;
1710 		case FORMAT_INTZ:				return 4;
1711 		case FORMAT_S8:					return 1;
1712 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1713 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1714 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1715 		default:
1716 			ASSERT(false);
1717 		}
1718 
1719 		return 0;
1720 	}
1721 
pitchB(int width,int border,Format format,bool target)1722 	int Surface::pitchB(int width, int border, Format format, bool target)
1723 	{
1724 		width += 2 * border;
1725 
1726 		// Render targets require 2x2 quads
1727 		if(target || isDepth(format) || isStencil(format))
1728 		{
1729 			width = align<2>(width);
1730 		}
1731 
1732 		switch(format)
1733 		{
1734 		case FORMAT_DXT1:
1735 		case FORMAT_ETC1:
1736 		case FORMAT_R11_EAC:
1737 		case FORMAT_SIGNED_R11_EAC:
1738 		case FORMAT_RGB8_ETC2:
1739 		case FORMAT_SRGB8_ETC2:
1740 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1741 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1742 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1743 		case FORMAT_RG11_EAC:
1744 		case FORMAT_SIGNED_RG11_EAC:
1745 		case FORMAT_RGBA8_ETC2_EAC:
1746 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1747 		case FORMAT_RGBA_ASTC_4x4_KHR:
1748 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1749 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1750 		case FORMAT_RGBA_ASTC_5x4_KHR:
1751 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1752 		case FORMAT_RGBA_ASTC_5x5_KHR:
1753 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1754 			return 16 * ((width + 4) / 5);
1755 		case FORMAT_RGBA_ASTC_6x5_KHR:
1756 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1757 		case FORMAT_RGBA_ASTC_6x6_KHR:
1758 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1759 			return 16 * ((width + 5) / 6);
1760 		case FORMAT_RGBA_ASTC_8x5_KHR:
1761 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1762 		case FORMAT_RGBA_ASTC_8x6_KHR:
1763 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1764 		case FORMAT_RGBA_ASTC_8x8_KHR:
1765 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1766 			return 16 * ((width + 7) / 8);
1767 		case FORMAT_RGBA_ASTC_10x5_KHR:
1768 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1769 		case FORMAT_RGBA_ASTC_10x6_KHR:
1770 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1771 		case FORMAT_RGBA_ASTC_10x8_KHR:
1772 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1773 		case FORMAT_RGBA_ASTC_10x10_KHR:
1774 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1775 			return 16 * ((width + 9) / 10);
1776 		case FORMAT_RGBA_ASTC_12x10_KHR:
1777 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1778 		case FORMAT_RGBA_ASTC_12x12_KHR:
1779 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1780 			return 16 * ((width + 11) / 12);
1781 		case FORMAT_DXT3:
1782 		case FORMAT_DXT5:
1783 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1784 		case FORMAT_ATI1:
1785 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1786 		case FORMAT_ATI2:
1787 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1788 		case FORMAT_YV12_BT601:
1789 		case FORMAT_YV12_BT709:
1790 		case FORMAT_YV12_JFIF:
1791 			return align<16>(width);
1792 		default:
1793 			return bytes(format) * width;
1794 		}
1795 	}
1796 
pitchP(int width,int border,Format format,bool target)1797 	int Surface::pitchP(int width, int border, Format format, bool target)
1798 	{
1799 		int B = bytes(format);
1800 
1801 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
1802 	}
1803 
sliceB(int width,int height,int border,Format format,bool target)1804 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
1805 	{
1806 		height += 2 * border;
1807 
1808 		// Render targets require 2x2 quads
1809 		if(target || isDepth(format) || isStencil(format))
1810 		{
1811 			height = align<2>(height);
1812 		}
1813 
1814 		switch(format)
1815 		{
1816 		case FORMAT_DXT1:
1817 		case FORMAT_DXT3:
1818 		case FORMAT_DXT5:
1819 		case FORMAT_ETC1:
1820 		case FORMAT_R11_EAC:
1821 		case FORMAT_SIGNED_R11_EAC:
1822 		case FORMAT_RG11_EAC:
1823 		case FORMAT_SIGNED_RG11_EAC:
1824 		case FORMAT_RGB8_ETC2:
1825 		case FORMAT_SRGB8_ETC2:
1826 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1827 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1828 		case FORMAT_RGBA8_ETC2_EAC:
1829 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1830 		case FORMAT_RGBA_ASTC_4x4_KHR:
1831 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1832 		case FORMAT_RGBA_ASTC_5x4_KHR:
1833 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1834 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1835 		case FORMAT_RGBA_ASTC_5x5_KHR:
1836 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1837 		case FORMAT_RGBA_ASTC_6x5_KHR:
1838 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1839 		case FORMAT_RGBA_ASTC_8x5_KHR:
1840 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1841 		case FORMAT_RGBA_ASTC_10x5_KHR:
1842 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1843 			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1844 		case FORMAT_RGBA_ASTC_6x6_KHR:
1845 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1846 		case FORMAT_RGBA_ASTC_8x6_KHR:
1847 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1848 		case FORMAT_RGBA_ASTC_10x6_KHR:
1849 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1850 			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1851 		case FORMAT_RGBA_ASTC_8x8_KHR:
1852 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1853 		case FORMAT_RGBA_ASTC_10x8_KHR:
1854 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1855 			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1856 		case FORMAT_RGBA_ASTC_10x10_KHR:
1857 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1858 		case FORMAT_RGBA_ASTC_12x10_KHR:
1859 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1860 			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1861 		case FORMAT_RGBA_ASTC_12x12_KHR:
1862 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1863 			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1864 		case FORMAT_ATI1:
1865 		case FORMAT_ATI2:
1866 			return pitchB(width, border, format, target) * align<4>(height);   // Pitch computed per row
1867 		default:
1868 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
1869 		}
1870 	}
1871 
sliceP(int width,int height,int border,Format format,bool target)1872 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
1873 	{
1874 		int B = bytes(format);
1875 
1876 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
1877 	}
1878 
update(Buffer & destination,Buffer & source)1879 	void Surface::update(Buffer &destination, Buffer &source)
1880 	{
1881 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1882 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1883 
1884 		if(destination.buffer != source.buffer)
1885 		{
1886 			ASSERT(source.dirty && !destination.dirty);
1887 
1888 			switch(source.format)
1889 			{
1890 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1891 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1892 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1893 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1894 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1895 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1896 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1897 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1898 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1899 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1900 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1901 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1902 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1903 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1904 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1905 			case FORMAT_ETC1:
1906 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1907 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1908 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1909 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1910 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1911 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1912 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1913 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1914 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1915 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1916 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1917 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1918 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1919 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1920 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1921 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1922 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1923 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1924 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1925 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1926 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1927 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1928 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1929 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1930 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1931 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1932 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1933 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1934 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1935 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1936 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1937 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1938 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1939 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1940 			default:				genericUpdate(destination, source);		break;
1941 			}
1942 		}
1943 	}
1944 
genericUpdate(Buffer & destination,Buffer & source)1945 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1946 	{
1947 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1948 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1949 
1950 		int depth = min(destination.depth, source.depth);
1951 		int height = min(destination.height, source.height);
1952 		int width = min(destination.width, source.width);
1953 		int rowBytes = width * source.bytes;
1954 
1955 		for(int z = 0; z < depth; z++)
1956 		{
1957 			unsigned char *sourceRow = sourceSlice;
1958 			unsigned char *destinationRow = destinationSlice;
1959 
1960 			for(int y = 0; y < height; y++)
1961 			{
1962 				if(source.format == destination.format)
1963 				{
1964 					memcpy(destinationRow, sourceRow, rowBytes);
1965 				}
1966 				else
1967 				{
1968 					unsigned char *sourceElement = sourceRow;
1969 					unsigned char *destinationElement = destinationRow;
1970 
1971 					for(int x = 0; x < width; x++)
1972 					{
1973 						Color<float> color = source.read(sourceElement);
1974 						destination.write(destinationElement, color);
1975 
1976 						sourceElement += source.bytes;
1977 						destinationElement += destination.bytes;
1978 					}
1979 				}
1980 
1981 				sourceRow += source.pitchB;
1982 				destinationRow += destination.pitchB;
1983 			}
1984 
1985 			sourceSlice += source.sliceB;
1986 			destinationSlice += destination.sliceB;
1987 		}
1988 
1989 		source.unlockRect();
1990 		destination.unlockRect();
1991 	}
1992 
decodeR8G8B8(Buffer & destination,Buffer & source)1993 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
1994 	{
1995 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1996 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1997 
1998 		int depth = min(destination.depth, source.depth);
1999 		int height = min(destination.height, source.height);
2000 		int width = min(destination.width, source.width);
2001 
2002 		for(int z = 0; z < depth; z++)
2003 		{
2004 			unsigned char *sourceRow = sourceSlice;
2005 			unsigned char *destinationRow = destinationSlice;
2006 
2007 			for(int y = 0; y < height; y++)
2008 			{
2009 				unsigned char *sourceElement = sourceRow;
2010 				unsigned char *destinationElement = destinationRow;
2011 
2012 				for(int x = 0; x < width; x++)
2013 				{
2014 					unsigned int b = sourceElement[0];
2015 					unsigned int g = sourceElement[1];
2016 					unsigned int r = sourceElement[2];
2017 
2018 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
2019 
2020 					sourceElement += source.bytes;
2021 					destinationElement += destination.bytes;
2022 				}
2023 
2024 				sourceRow += source.pitchB;
2025 				destinationRow += destination.pitchB;
2026 			}
2027 
2028 			sourceSlice += source.sliceB;
2029 			destinationSlice += destination.sliceB;
2030 		}
2031 
2032 		source.unlockRect();
2033 		destination.unlockRect();
2034 	}
2035 
decodeX1R5G5B5(Buffer & destination,Buffer & source)2036 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
2037 	{
2038 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2039 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2040 
2041 		int depth = min(destination.depth, source.depth);
2042 		int height = min(destination.height, source.height);
2043 		int width = min(destination.width, source.width);
2044 
2045 		for(int z = 0; z < depth; z++)
2046 		{
2047 			unsigned char *sourceRow = sourceSlice;
2048 			unsigned char *destinationRow = destinationSlice;
2049 
2050 			for(int y = 0; y < height; y++)
2051 			{
2052 				unsigned char *sourceElement = sourceRow;
2053 				unsigned char *destinationElement = destinationRow;
2054 
2055 				for(int x = 0; x < width; x++)
2056 				{
2057 					unsigned int xrgb = *(unsigned short*)sourceElement;
2058 
2059 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2060 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
2061 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
2062 
2063 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2064 
2065 					sourceElement += source.bytes;
2066 					destinationElement += destination.bytes;
2067 				}
2068 
2069 				sourceRow += source.pitchB;
2070 				destinationRow += destination.pitchB;
2071 			}
2072 
2073 			sourceSlice += source.sliceB;
2074 			destinationSlice += destination.sliceB;
2075 		}
2076 
2077 		source.unlockRect();
2078 		destination.unlockRect();
2079 	}
2080 
decodeA1R5G5B5(Buffer & destination,Buffer & source)2081 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
2082 	{
2083 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2084 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2085 
2086 		int depth = min(destination.depth, source.depth);
2087 		int height = min(destination.height, source.height);
2088 		int width = min(destination.width, source.width);
2089 
2090 		for(int z = 0; z < depth; z++)
2091 		{
2092 			unsigned char *sourceRow = sourceSlice;
2093 			unsigned char *destinationRow = destinationSlice;
2094 
2095 			for(int y = 0; y < height; y++)
2096 			{
2097 				unsigned char *sourceElement = sourceRow;
2098 				unsigned char *destinationElement = destinationRow;
2099 
2100 				for(int x = 0; x < width; x++)
2101 				{
2102 					unsigned int argb = *(unsigned short*)sourceElement;
2103 
2104 					unsigned int a =   (argb & 0x8000) * 130560;
2105 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2106 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
2107 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
2108 
2109 					*(unsigned int*)destinationElement = a | r | g | b;
2110 
2111 					sourceElement += source.bytes;
2112 					destinationElement += destination.bytes;
2113 				}
2114 
2115 				sourceRow += source.pitchB;
2116 				destinationRow += destination.pitchB;
2117 			}
2118 
2119 			sourceSlice += source.sliceB;
2120 			destinationSlice += destination.sliceB;
2121 		}
2122 
2123 		source.unlockRect();
2124 		destination.unlockRect();
2125 	}
2126 
decodeX4R4G4B4(Buffer & destination,Buffer & source)2127 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
2128 	{
2129 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2130 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2131 
2132 		int depth = min(destination.depth, source.depth);
2133 		int height = min(destination.height, source.height);
2134 		int width = min(destination.width, source.width);
2135 
2136 		for(int z = 0; z < depth; z++)
2137 		{
2138 			unsigned char *sourceRow = sourceSlice;
2139 			unsigned char *destinationRow = destinationSlice;
2140 
2141 			for(int y = 0; y < height; y++)
2142 			{
2143 				unsigned char *sourceElement = sourceRow;
2144 				unsigned char *destinationElement = destinationRow;
2145 
2146 				for(int x = 0; x < width; x++)
2147 				{
2148 					unsigned int xrgb = *(unsigned short*)sourceElement;
2149 
2150 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2151 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2152 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2153 
2154 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2155 
2156 					sourceElement += source.bytes;
2157 					destinationElement += destination.bytes;
2158 				}
2159 
2160 				sourceRow += source.pitchB;
2161 				destinationRow += destination.pitchB;
2162 			}
2163 
2164 			sourceSlice += source.sliceB;
2165 			destinationSlice += destination.sliceB;
2166 		}
2167 
2168 		source.unlockRect();
2169 		destination.unlockRect();
2170 	}
2171 
decodeA4R4G4B4(Buffer & destination,Buffer & source)2172 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
2173 	{
2174 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2175 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2176 
2177 		int depth = min(destination.depth, source.depth);
2178 		int height = min(destination.height, source.height);
2179 		int width = min(destination.width, source.width);
2180 
2181 		for(int z = 0; z < depth; z++)
2182 		{
2183 			unsigned char *sourceRow = sourceSlice;
2184 			unsigned char *destinationRow = destinationSlice;
2185 
2186 			for(int y = 0; y < height; y++)
2187 			{
2188 				unsigned char *sourceElement = sourceRow;
2189 				unsigned char *destinationElement = destinationRow;
2190 
2191 				for(int x = 0; x < width; x++)
2192 				{
2193 					unsigned int argb = *(unsigned short*)sourceElement;
2194 
2195 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2196 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2197 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2198 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2199 
2200 					*(unsigned int*)destinationElement = a | r | g | b;
2201 
2202 					sourceElement += source.bytes;
2203 					destinationElement += destination.bytes;
2204 				}
2205 
2206 				sourceRow += source.pitchB;
2207 				destinationRow += destination.pitchB;
2208 			}
2209 
2210 			sourceSlice += source.sliceB;
2211 			destinationSlice += destination.sliceB;
2212 		}
2213 
2214 		source.unlockRect();
2215 		destination.unlockRect();
2216 	}
2217 
decodeP8(Buffer & destination,Buffer & source)2218 	void Surface::decodeP8(Buffer &destination, Buffer &source)
2219 	{
2220 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2221 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2222 
2223 		int depth = min(destination.depth, source.depth);
2224 		int height = min(destination.height, source.height);
2225 		int width = min(destination.width, source.width);
2226 
2227 		for(int z = 0; z < depth; z++)
2228 		{
2229 			unsigned char *sourceRow = sourceSlice;
2230 			unsigned char *destinationRow = destinationSlice;
2231 
2232 			for(int y = 0; y < height; y++)
2233 			{
2234 				unsigned char *sourceElement = sourceRow;
2235 				unsigned char *destinationElement = destinationRow;
2236 
2237 				for(int x = 0; x < width; x++)
2238 				{
2239 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2240 
2241 					unsigned int r = (abgr & 0x000000FF) << 16;
2242 					unsigned int g = (abgr & 0x0000FF00) << 0;
2243 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2244 					unsigned int a = (abgr & 0xFF000000) >> 0;
2245 
2246 					*(unsigned int*)destinationElement = a | r | g | b;
2247 
2248 					sourceElement += source.bytes;
2249 					destinationElement += destination.bytes;
2250 				}
2251 
2252 				sourceRow += source.pitchB;
2253 				destinationRow += destination.pitchB;
2254 			}
2255 
2256 			sourceSlice += source.sliceB;
2257 			destinationSlice += destination.sliceB;
2258 		}
2259 
2260 		source.unlockRect();
2261 		destination.unlockRect();
2262 	}
2263 
decodeDXT1(Buffer & internal,Buffer & external)2264 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
2265 	{
2266 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2267 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2268 
2269 		for(int z = 0; z < external.depth; z++)
2270 		{
2271 			unsigned int *dest = destSlice;
2272 
2273 			for(int y = 0; y < external.height; y += 4)
2274 			{
2275 				for(int x = 0; x < external.width; x += 4)
2276 				{
2277 					Color<byte> c[4];
2278 
2279 					c[0] = source->c0;
2280 					c[1] = source->c1;
2281 
2282 					if(source->c0 > source->c1)   // No transparency
2283 					{
2284 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2285 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2286 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2287 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2288 						c[2].a = 0xFF;
2289 
2290 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2291 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2292 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2293 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2294 						c[3].a = 0xFF;
2295 					}
2296 					else   // c3 transparent
2297 					{
2298 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2299 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2300 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2301 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2302 						c[2].a = 0xFF;
2303 
2304 						c[3].r = 0;
2305 						c[3].g = 0;
2306 						c[3].b = 0;
2307 						c[3].a = 0;
2308 					}
2309 
2310 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2311 					{
2312 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2313 						{
2314 							dest[(x + i) + (y + j) * internal.pitchP] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2315 						}
2316 					}
2317 
2318 					source++;
2319 				}
2320 			}
2321 
2322 			(byte*&)destSlice += internal.sliceB;
2323 		}
2324 
2325 		external.unlockRect();
2326 		internal.unlockRect();
2327 	}
2328 
decodeDXT3(Buffer & internal,Buffer & external)2329 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
2330 	{
2331 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2332 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
2333 
2334 		for(int z = 0; z < external.depth; z++)
2335 		{
2336 			unsigned int *dest = destSlice;
2337 
2338 			for(int y = 0; y < external.height; y += 4)
2339 			{
2340 				for(int x = 0; x < external.width; x += 4)
2341 				{
2342 					Color<byte> c[4];
2343 
2344 					c[0] = source->c0;
2345 					c[1] = source->c1;
2346 
2347 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2348 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2349 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2350 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2351 
2352 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2353 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2354 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2355 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2356 
2357 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2358 					{
2359 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2360 						{
2361 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2362 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2363 
2364 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2365 						}
2366 					}
2367 
2368 					source++;
2369 				}
2370 			}
2371 
2372 			(byte*&)destSlice += internal.sliceB;
2373 		}
2374 
2375 		external.unlockRect();
2376 		internal.unlockRect();
2377 	}
2378 
decodeDXT5(Buffer & internal,Buffer & external)2379 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
2380 	{
2381 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2382 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
2383 
2384 		for(int z = 0; z < external.depth; z++)
2385 		{
2386 			unsigned int *dest = destSlice;
2387 
2388 			for(int y = 0; y < external.height; y += 4)
2389 			{
2390 				for(int x = 0; x < external.width; x += 4)
2391 				{
2392 					Color<byte> c[4];
2393 
2394 					c[0] = source->c0;
2395 					c[1] = source->c1;
2396 
2397 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2398 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2399 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2400 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2401 
2402 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2403 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2404 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2405 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2406 
2407 					byte a[8];
2408 
2409 					a[0] = source->a0;
2410 					a[1] = source->a1;
2411 
2412 					if(a[0] > a[1])
2413 					{
2414 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2415 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2416 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2417 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2418 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2419 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2420 					}
2421 					else
2422 					{
2423 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2424 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2425 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2426 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2427 						a[6] = 0;
2428 						a[7] = 0xFF;
2429 					}
2430 
2431 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2432 					{
2433 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2434 						{
2435 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2436 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2437 
2438 							dest[(x + i) + (y + j) * internal.pitchP] = color;
2439 						}
2440 					}
2441 
2442 					source++;
2443 				}
2444 			}
2445 
2446 			(byte*&)destSlice += internal.sliceB;
2447 		}
2448 
2449 		external.unlockRect();
2450 		internal.unlockRect();
2451 	}
2452 
decodeATI1(Buffer & internal,Buffer & external)2453 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
2454 	{
2455 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2456 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2457 
2458 		for(int z = 0; z < external.depth; z++)
2459 		{
2460 			byte *dest = destSlice;
2461 
2462 			for(int y = 0; y < external.height; y += 4)
2463 			{
2464 				for(int x = 0; x < external.width; x += 4)
2465 				{
2466 					byte r[8];
2467 
2468 					r[0] = source->r0;
2469 					r[1] = source->r1;
2470 
2471 					if(r[0] > r[1])
2472 					{
2473 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2474 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2475 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2476 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2477 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2478 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2479 					}
2480 					else
2481 					{
2482 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2483 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2484 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2485 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2486 						r[6] = 0;
2487 						r[7] = 0xFF;
2488 					}
2489 
2490 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2491 					{
2492 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2493 						{
2494 							dest[(x + i) + (y + j) * internal.pitchP] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2495 						}
2496 					}
2497 
2498 					source++;
2499 				}
2500 			}
2501 
2502 			destSlice += internal.sliceB;
2503 		}
2504 
2505 		external.unlockRect();
2506 		internal.unlockRect();
2507 	}
2508 
decodeATI2(Buffer & internal,Buffer & external)2509 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
2510 	{
2511 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2512 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
2513 
2514 		for(int z = 0; z < external.depth; z++)
2515 		{
2516 			word *dest = destSlice;
2517 
2518 			for(int y = 0; y < external.height; y += 4)
2519 			{
2520 				for(int x = 0; x < external.width; x += 4)
2521 				{
2522 					byte X[8];
2523 
2524 					X[0] = source->x0;
2525 					X[1] = source->x1;
2526 
2527 					if(X[0] > X[1])
2528 					{
2529 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2530 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2531 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2532 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2533 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2534 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2535 					}
2536 					else
2537 					{
2538 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2539 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2540 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2541 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2542 						X[6] = 0;
2543 						X[7] = 0xFF;
2544 					}
2545 
2546 					byte Y[8];
2547 
2548 					Y[0] = source->y0;
2549 					Y[1] = source->y1;
2550 
2551 					if(Y[0] > Y[1])
2552 					{
2553 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2554 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2555 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2556 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2557 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2558 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2559 					}
2560 					else
2561 					{
2562 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2563 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2564 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2565 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2566 						Y[6] = 0;
2567 						Y[7] = 0xFF;
2568 					}
2569 
2570 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2571 					{
2572 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2573 						{
2574 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2575 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2576 
2577 							dest[(x + i) + (y + j) * internal.pitchP] = (g << 8) + r;
2578 						}
2579 					}
2580 
2581 					source++;
2582 				}
2583 			}
2584 
2585 			(byte*&)destSlice += internal.sliceB;
2586 		}
2587 
2588 		external.unlockRect();
2589 		internal.unlockRect();
2590 	}
2591 
decodeETC2(Buffer & internal,Buffer & external,int nbAlphaBits,bool isSRGB)2592 	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
2593 	{
2594 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2595 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2596 		external.unlockRect();
2597 		internal.unlockRect();
2598 
2599 		if(isSRGB)
2600 		{
2601 			static byte sRGBtoLinearTable[256];
2602 			static bool sRGBtoLinearTableDirty = true;
2603 			if(sRGBtoLinearTableDirty)
2604 			{
2605 				for(int i = 0; i < 256; i++)
2606 				{
2607 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2608 				}
2609 				sRGBtoLinearTableDirty = false;
2610 			}
2611 
2612 			// Perform sRGB conversion in place after decoding
2613 			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2614 			for(int y = 0; y < internal.height; y++)
2615 			{
2616 				byte *srcRow = src + y * internal.pitchB;
2617 				for(int x = 0; x <  internal.width; x++)
2618 				{
2619 					byte *srcPix = srcRow + x * internal.bytes;
2620 					for(int i = 0; i < 3; i++)
2621 					{
2622 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2623 					}
2624 				}
2625 			}
2626 			internal.unlockRect();
2627 		}
2628 	}
2629 
decodeEAC(Buffer & internal,Buffer & external,int nbChannels,bool isSigned)2630 	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
2631 	{
2632 		ASSERT(nbChannels == 1 || nbChannels == 2);
2633 
2634 		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2635 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2636 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2637 		external.unlockRect();
2638 
2639 		// FIXME: We convert EAC data to float, until signed short internal formats are supported
2640 		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
2641 		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
2642 		for(int y = 0; y < internal.height; y++)
2643 		{
2644 			byte* srcRow = src + y * internal.pitchB;
2645 			for(int x = internal.width - 1; x >= 0; x--)
2646 			{
2647 				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
2648 				float* dstPix = reinterpret_cast<float*>(srcPix);
2649 				for(int c = nbChannels - 1; c >= 0; c--)
2650 				{
2651 					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2652 				}
2653 			}
2654 		}
2655 
2656 		internal.unlockRect();
2657 	}
2658 
	// Placeholder for ASTC decoding: the body is empty, so neither buffer is touched
	// and all parameters are currently ignored.
	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
	{
	}
2662 
size(int width,int height,int depth,int border,int samples,Format format)2663 	size_t Surface::size(int width, int height, int depth, int border, int samples, Format format)
2664 	{
2665 		samples = max(1, samples);
2666 
2667 		switch(format)
2668 		{
2669 		default:
2670 			{
2671 				uint64_t size = (uint64_t)sliceB(width, height, border, format, true) * depth * samples;
2672 
2673 				// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
2674 				// and stencil operations also read 8 bytes per four 8-bit stencil values,
2675 				// so we have to allocate 4 extra bytes to avoid buffer overruns.
2676 				size += 4;
2677 
2678 				// We can only sample buffers smaller than 2 GiB.
2679 				// Force an out-of-memory if larger, or let the caller report an error.
2680 				return size < 0x80000000u ? (size_t)size : std::numeric_limits<size_t>::max();
2681 			}
2682 		case FORMAT_YV12_BT601:
2683 		case FORMAT_YV12_BT709:
2684 		case FORMAT_YV12_JFIF:
2685 			{
2686 				width += 2 * border;
2687 				height += 2 * border;
2688 
2689 				size_t YStride = align<16>(width);
2690 				size_t YSize = YStride * height;
2691 				size_t CStride = align<16>(YStride / 2);
2692 				size_t CSize = CStride * height / 2;
2693 
2694 				return YSize + 2 * CSize;
2695 			}
2696 		}
2697 	}
2698 
isStencil(Format format)2699 	bool Surface::isStencil(Format format)
2700 	{
2701 		switch(format)
2702 		{
2703 		case FORMAT_D32:
2704 		case FORMAT_D16:
2705 		case FORMAT_D24X8:
2706 		case FORMAT_D32F:
2707 		case FORMAT_D32F_COMPLEMENTARY:
2708 		case FORMAT_D32F_LOCKABLE:
2709 		case FORMAT_D32F_SHADOW:
2710 			return false;
2711 		case FORMAT_D24S8:
2712 		case FORMAT_D24FS8:
2713 		case FORMAT_S8:
2714 		case FORMAT_DF24S8:
2715 		case FORMAT_DF16S8:
2716 		case FORMAT_D32FS8_TEXTURE:
2717 		case FORMAT_D32FS8_SHADOW:
2718 		case FORMAT_D32FS8:
2719 		case FORMAT_D32FS8_COMPLEMENTARY:
2720 		case FORMAT_INTZ:
2721 			return true;
2722 		default:
2723 			return false;
2724 		}
2725 	}
2726 
isDepth(Format format)2727 	bool Surface::isDepth(Format format)
2728 	{
2729 		switch(format)
2730 		{
2731 		case FORMAT_D32:
2732 		case FORMAT_D16:
2733 		case FORMAT_D24X8:
2734 		case FORMAT_D24S8:
2735 		case FORMAT_D24FS8:
2736 		case FORMAT_D32F:
2737 		case FORMAT_D32FS8:
2738 		case FORMAT_D32F_COMPLEMENTARY:
2739 		case FORMAT_D32FS8_COMPLEMENTARY:
2740 		case FORMAT_D32F_LOCKABLE:
2741 		case FORMAT_DF24S8:
2742 		case FORMAT_DF16S8:
2743 		case FORMAT_D32FS8_TEXTURE:
2744 		case FORMAT_D32F_SHADOW:
2745 		case FORMAT_D32FS8_SHADOW:
2746 		case FORMAT_INTZ:
2747 			return true;
2748 		case FORMAT_S8:
2749 			return false;
2750 		default:
2751 			return false;
2752 		}
2753 	}
2754 
hasQuadLayout(Format format)2755 	bool Surface::hasQuadLayout(Format format)
2756 	{
2757 		switch(format)
2758 		{
2759 		case FORMAT_D32:
2760 		case FORMAT_D16:
2761 		case FORMAT_D24X8:
2762 		case FORMAT_D24S8:
2763 		case FORMAT_D24FS8:
2764 		case FORMAT_D32F:
2765 		case FORMAT_D32FS8:
2766 		case FORMAT_D32F_COMPLEMENTARY:
2767 		case FORMAT_D32FS8_COMPLEMENTARY:
2768 		case FORMAT_DF24S8:
2769 		case FORMAT_DF16S8:
2770 		case FORMAT_INTZ:
2771 		case FORMAT_S8:
2772 		case FORMAT_A8G8R8B8Q:
2773 		case FORMAT_X8G8R8B8Q:
2774 			return true;
2775 		case FORMAT_D32F_LOCKABLE:
2776 		case FORMAT_D32FS8_TEXTURE:
2777 		case FORMAT_D32F_SHADOW:
2778 		case FORMAT_D32FS8_SHADOW:
2779 		default:
2780 			break;
2781 		}
2782 
2783 		return false;
2784 	}
2785 
isPalette(Format format)2786 	bool Surface::isPalette(Format format)
2787 	{
2788 		switch(format)
2789 		{
2790 		case FORMAT_P8:
2791 		case FORMAT_A8P8:
2792 			return true;
2793 		default:
2794 			return false;
2795 		}
2796 	}
2797 
isFloatFormat(Format format)2798 	bool Surface::isFloatFormat(Format format)
2799 	{
2800 		switch(format)
2801 		{
2802 		case FORMAT_R5G6B5:
2803 		case FORMAT_R8G8B8:
2804 		case FORMAT_B8G8R8:
2805 		case FORMAT_X8R8G8B8:
2806 		case FORMAT_X8B8G8R8I:
2807 		case FORMAT_X8B8G8R8:
2808 		case FORMAT_A8R8G8B8:
2809 		case FORMAT_SRGB8_X8:
2810 		case FORMAT_SRGB8_A8:
2811 		case FORMAT_A8B8G8R8I:
2812 		case FORMAT_R8UI:
2813 		case FORMAT_G8R8UI:
2814 		case FORMAT_X8B8G8R8UI:
2815 		case FORMAT_A8B8G8R8UI:
2816 		case FORMAT_A8B8G8R8:
2817 		case FORMAT_G8R8I:
2818 		case FORMAT_G8R8:
2819 		case FORMAT_A2B10G10R10:
2820 		case FORMAT_A2B10G10R10UI:
2821 		case FORMAT_R8_SNORM:
2822 		case FORMAT_G8R8_SNORM:
2823 		case FORMAT_X8B8G8R8_SNORM:
2824 		case FORMAT_A8B8G8R8_SNORM:
2825 		case FORMAT_R16I:
2826 		case FORMAT_R16UI:
2827 		case FORMAT_G16R16I:
2828 		case FORMAT_G16R16UI:
2829 		case FORMAT_G16R16:
2830 		case FORMAT_X16B16G16R16I:
2831 		case FORMAT_X16B16G16R16UI:
2832 		case FORMAT_A16B16G16R16I:
2833 		case FORMAT_A16B16G16R16UI:
2834 		case FORMAT_A16B16G16R16:
2835 		case FORMAT_V8U8:
2836 		case FORMAT_Q8W8V8U8:
2837 		case FORMAT_X8L8V8U8:
2838 		case FORMAT_V16U16:
2839 		case FORMAT_A16W16V16U16:
2840 		case FORMAT_Q16W16V16U16:
2841 		case FORMAT_A8:
2842 		case FORMAT_R8I:
2843 		case FORMAT_R8:
2844 		case FORMAT_S8:
2845 		case FORMAT_L8:
2846 		case FORMAT_L16:
2847 		case FORMAT_A8L8:
2848 		case FORMAT_YV12_BT601:
2849 		case FORMAT_YV12_BT709:
2850 		case FORMAT_YV12_JFIF:
2851 		case FORMAT_R32I:
2852 		case FORMAT_R32UI:
2853 		case FORMAT_G32R32I:
2854 		case FORMAT_G32R32UI:
2855 		case FORMAT_X32B32G32R32I:
2856 		case FORMAT_X32B32G32R32UI:
2857 		case FORMAT_A32B32G32R32I:
2858 		case FORMAT_A32B32G32R32UI:
2859 			return false;
2860 		case FORMAT_R16F:
2861 		case FORMAT_G16R16F:
2862 		case FORMAT_B16G16R16F:
2863 		case FORMAT_X16B16G16R16F:
2864 		case FORMAT_A16B16G16R16F:
2865 		case FORMAT_X16B16G16R16F_UNSIGNED:
2866 		case FORMAT_R32F:
2867 		case FORMAT_G32R32F:
2868 		case FORMAT_B32G32R32F:
2869 		case FORMAT_X32B32G32R32F:
2870 		case FORMAT_A32B32G32R32F:
2871 		case FORMAT_X32B32G32R32F_UNSIGNED:
2872 		case FORMAT_D32F:
2873 		case FORMAT_D32FS8:
2874 		case FORMAT_D32F_COMPLEMENTARY:
2875 		case FORMAT_D32FS8_COMPLEMENTARY:
2876 		case FORMAT_D32F_LOCKABLE:
2877 		case FORMAT_D32FS8_TEXTURE:
2878 		case FORMAT_D32F_SHADOW:
2879 		case FORMAT_D32FS8_SHADOW:
2880 		case FORMAT_L16F:
2881 		case FORMAT_A16L16F:
2882 		case FORMAT_L32F:
2883 		case FORMAT_A32L32F:
2884 			return true;
2885 		default:
2886 			ASSERT(false);
2887 		}
2888 
2889 		return false;
2890 	}
2891 
isUnsignedComponent(Format format,int component)2892 	bool Surface::isUnsignedComponent(Format format, int component)
2893 	{
2894 		switch(format)
2895 		{
2896 		case FORMAT_NULL:
2897 		case FORMAT_R5G6B5:
2898 		case FORMAT_R8G8B8:
2899 		case FORMAT_B8G8R8:
2900 		case FORMAT_X8R8G8B8:
2901 		case FORMAT_X8B8G8R8:
2902 		case FORMAT_A8R8G8B8:
2903 		case FORMAT_A8B8G8R8:
2904 		case FORMAT_SRGB8_X8:
2905 		case FORMAT_SRGB8_A8:
2906 		case FORMAT_G8R8:
2907 		case FORMAT_A2B10G10R10:
2908 		case FORMAT_A2B10G10R10UI:
2909 		case FORMAT_R16UI:
2910 		case FORMAT_G16R16:
2911 		case FORMAT_G16R16UI:
2912 		case FORMAT_X16B16G16R16UI:
2913 		case FORMAT_A16B16G16R16:
2914 		case FORMAT_A16B16G16R16UI:
2915 		case FORMAT_R32UI:
2916 		case FORMAT_G32R32UI:
2917 		case FORMAT_X32B32G32R32UI:
2918 		case FORMAT_A32B32G32R32UI:
2919 		case FORMAT_X32B32G32R32F_UNSIGNED:
2920 		case FORMAT_R8UI:
2921 		case FORMAT_G8R8UI:
2922 		case FORMAT_X8B8G8R8UI:
2923 		case FORMAT_A8B8G8R8UI:
2924 		case FORMAT_D32F:
2925 		case FORMAT_D32FS8:
2926 		case FORMAT_D32F_COMPLEMENTARY:
2927 		case FORMAT_D32FS8_COMPLEMENTARY:
2928 		case FORMAT_D32F_LOCKABLE:
2929 		case FORMAT_D32FS8_TEXTURE:
2930 		case FORMAT_D32F_SHADOW:
2931 		case FORMAT_D32FS8_SHADOW:
2932 		case FORMAT_A8:
2933 		case FORMAT_R8:
2934 		case FORMAT_L8:
2935 		case FORMAT_L16:
2936 		case FORMAT_A8L8:
2937 		case FORMAT_YV12_BT601:
2938 		case FORMAT_YV12_BT709:
2939 		case FORMAT_YV12_JFIF:
2940 			return true;
2941 		case FORMAT_A8B8G8R8I:
2942 		case FORMAT_A16B16G16R16I:
2943 		case FORMAT_A32B32G32R32I:
2944 		case FORMAT_A8B8G8R8_SNORM:
2945 		case FORMAT_Q8W8V8U8:
2946 		case FORMAT_Q16W16V16U16:
2947 		case FORMAT_A32B32G32R32F:
2948 			return false;
2949 		case FORMAT_R32F:
2950 		case FORMAT_R8I:
2951 		case FORMAT_R16I:
2952 		case FORMAT_R32I:
2953 		case FORMAT_R8_SNORM:
2954 			return component >= 1;
2955 		case FORMAT_V8U8:
2956 		case FORMAT_X8L8V8U8:
2957 		case FORMAT_V16U16:
2958 		case FORMAT_G32R32F:
2959 		case FORMAT_G8R8I:
2960 		case FORMAT_G16R16I:
2961 		case FORMAT_G32R32I:
2962 		case FORMAT_G8R8_SNORM:
2963 			return component >= 2;
2964 		case FORMAT_A16W16V16U16:
2965 		case FORMAT_B32G32R32F:
2966 		case FORMAT_X32B32G32R32F:
2967 		case FORMAT_X8B8G8R8I:
2968 		case FORMAT_X16B16G16R16I:
2969 		case FORMAT_X32B32G32R32I:
2970 		case FORMAT_X8B8G8R8_SNORM:
2971 			return component >= 3;
2972 		default:
2973 			ASSERT(false);
2974 		}
2975 
2976 		return false;
2977 	}
2978 
isSRGBreadable(Format format)2979 	bool Surface::isSRGBreadable(Format format)
2980 	{
2981 		// Keep in sync with Capabilities::isSRGBreadable
2982 		switch(format)
2983 		{
2984 		case FORMAT_L8:
2985 		case FORMAT_A8L8:
2986 		case FORMAT_R8G8B8:
2987 		case FORMAT_A8R8G8B8:
2988 		case FORMAT_X8R8G8B8:
2989 		case FORMAT_A8B8G8R8:
2990 		case FORMAT_X8B8G8R8:
2991 		case FORMAT_SRGB8_X8:
2992 		case FORMAT_SRGB8_A8:
2993 		case FORMAT_R5G6B5:
2994 		case FORMAT_X1R5G5B5:
2995 		case FORMAT_A1R5G5B5:
2996 		case FORMAT_A4R4G4B4:
2997 		case FORMAT_DXT1:
2998 		case FORMAT_DXT3:
2999 		case FORMAT_DXT5:
3000 		case FORMAT_ATI1:
3001 		case FORMAT_ATI2:
3002 			return true;
3003 		default:
3004 			return false;
3005 		}
3006 	}
3007 
isSRGBwritable(Format format)3008 	bool Surface::isSRGBwritable(Format format)
3009 	{
3010 		// Keep in sync with Capabilities::isSRGBwritable
3011 		switch(format)
3012 		{
3013 		case FORMAT_NULL:
3014 		case FORMAT_A8R8G8B8:
3015 		case FORMAT_X8R8G8B8:
3016 		case FORMAT_A8B8G8R8:
3017 		case FORMAT_X8B8G8R8:
3018 		case FORMAT_SRGB8_X8:
3019 		case FORMAT_SRGB8_A8:
3020 		case FORMAT_R5G6B5:
3021 			return true;
3022 		default:
3023 			return false;
3024 		}
3025 	}
3026 
isSRGBformat(Format format)3027 	bool Surface::isSRGBformat(Format format)
3028 	{
3029 		switch(format)
3030 		{
3031 		case FORMAT_SRGB8_X8:
3032 		case FORMAT_SRGB8_A8:
3033 			return true;
3034 		default:
3035 			return false;
3036 		}
3037 	}
3038 
isCompressed(Format format)3039 	bool Surface::isCompressed(Format format)
3040 	{
3041 		switch(format)
3042 		{
3043 		case FORMAT_DXT1:
3044 		case FORMAT_DXT3:
3045 		case FORMAT_DXT5:
3046 		case FORMAT_ATI1:
3047 		case FORMAT_ATI2:
3048 		case FORMAT_ETC1:
3049 		case FORMAT_R11_EAC:
3050 		case FORMAT_SIGNED_R11_EAC:
3051 		case FORMAT_RG11_EAC:
3052 		case FORMAT_SIGNED_RG11_EAC:
3053 		case FORMAT_RGB8_ETC2:
3054 		case FORMAT_SRGB8_ETC2:
3055 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3056 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3057 		case FORMAT_RGBA8_ETC2_EAC:
3058 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3059 		case FORMAT_RGBA_ASTC_4x4_KHR:
3060 		case FORMAT_RGBA_ASTC_5x4_KHR:
3061 		case FORMAT_RGBA_ASTC_5x5_KHR:
3062 		case FORMAT_RGBA_ASTC_6x5_KHR:
3063 		case FORMAT_RGBA_ASTC_6x6_KHR:
3064 		case FORMAT_RGBA_ASTC_8x5_KHR:
3065 		case FORMAT_RGBA_ASTC_8x6_KHR:
3066 		case FORMAT_RGBA_ASTC_8x8_KHR:
3067 		case FORMAT_RGBA_ASTC_10x5_KHR:
3068 		case FORMAT_RGBA_ASTC_10x6_KHR:
3069 		case FORMAT_RGBA_ASTC_10x8_KHR:
3070 		case FORMAT_RGBA_ASTC_10x10_KHR:
3071 		case FORMAT_RGBA_ASTC_12x10_KHR:
3072 		case FORMAT_RGBA_ASTC_12x12_KHR:
3073 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3074 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3075 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3076 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3077 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3078 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3079 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3080 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3081 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3082 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3083 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3084 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3085 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3086 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3087 			return true;
3088 		default:
3089 			return false;
3090 		}
3091 	}
3092 
isSignedNonNormalizedInteger(Format format)3093 	bool Surface::isSignedNonNormalizedInteger(Format format)
3094 	{
3095 		switch(format)
3096 		{
3097 		case FORMAT_A8B8G8R8I:
3098 		case FORMAT_X8B8G8R8I:
3099 		case FORMAT_G8R8I:
3100 		case FORMAT_R8I:
3101 		case FORMAT_A16B16G16R16I:
3102 		case FORMAT_X16B16G16R16I:
3103 		case FORMAT_G16R16I:
3104 		case FORMAT_R16I:
3105 		case FORMAT_A32B32G32R32I:
3106 		case FORMAT_X32B32G32R32I:
3107 		case FORMAT_G32R32I:
3108 		case FORMAT_R32I:
3109 			return true;
3110 		default:
3111 			return false;
3112 		}
3113 	}
3114 
isUnsignedNonNormalizedInteger(Format format)3115 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
3116 	{
3117 		switch(format)
3118 		{
3119 		case FORMAT_A8B8G8R8UI:
3120 		case FORMAT_X8B8G8R8UI:
3121 		case FORMAT_G8R8UI:
3122 		case FORMAT_R8UI:
3123 		case FORMAT_A16B16G16R16UI:
3124 		case FORMAT_X16B16G16R16UI:
3125 		case FORMAT_G16R16UI:
3126 		case FORMAT_R16UI:
3127 		case FORMAT_A32B32G32R32UI:
3128 		case FORMAT_X32B32G32R32UI:
3129 		case FORMAT_G32R32UI:
3130 		case FORMAT_R32UI:
3131 			return true;
3132 		default:
3133 			return false;
3134 		}
3135 	}
3136 
isNonNormalizedInteger(Format format)3137 	bool Surface::isNonNormalizedInteger(Format format)
3138 	{
3139 		return isSignedNonNormalizedInteger(format) ||
3140 		       isUnsignedNonNormalizedInteger(format);
3141 	}
3142 
isNormalizedInteger(Format format)3143 	bool Surface::isNormalizedInteger(Format format)
3144 	{
3145 		return !isFloatFormat(format) &&
3146 		       !isNonNormalizedInteger(format) &&
3147 		       !isCompressed(format) &&
3148 		       !isDepth(format) &&
3149 		       !isStencil(format);
3150 	}
3151 
componentCount(Format format)3152 	int Surface::componentCount(Format format)
3153 	{
3154 		switch(format)
3155 		{
3156 		case FORMAT_R5G6B5:         return 3;
3157 		case FORMAT_X8R8G8B8:       return 3;
3158 		case FORMAT_X8B8G8R8I:      return 3;
3159 		case FORMAT_X8B8G8R8:       return 3;
3160 		case FORMAT_A8R8G8B8:       return 4;
3161 		case FORMAT_SRGB8_X8:       return 3;
3162 		case FORMAT_SRGB8_A8:       return 4;
3163 		case FORMAT_A8B8G8R8I:      return 4;
3164 		case FORMAT_A8B8G8R8:       return 4;
3165 		case FORMAT_G8R8I:          return 2;
3166 		case FORMAT_G8R8:           return 2;
3167 		case FORMAT_R8_SNORM:      return 1;
3168 		case FORMAT_G8R8_SNORM:    return 2;
3169 		case FORMAT_X8B8G8R8_SNORM:return 3;
3170 		case FORMAT_A8B8G8R8_SNORM:return 4;
3171 		case FORMAT_R8UI:           return 1;
3172 		case FORMAT_G8R8UI:         return 2;
3173 		case FORMAT_X8B8G8R8UI:     return 3;
3174 		case FORMAT_A8B8G8R8UI:     return 4;
3175 		case FORMAT_A2B10G10R10:    return 4;
3176 		case FORMAT_A2B10G10R10UI:  return 4;
3177 		case FORMAT_G16R16I:        return 2;
3178 		case FORMAT_G16R16UI:       return 2;
3179 		case FORMAT_G16R16:         return 2;
3180 		case FORMAT_G32R32I:        return 2;
3181 		case FORMAT_G32R32UI:       return 2;
3182 		case FORMAT_X16B16G16R16I:  return 3;
3183 		case FORMAT_X16B16G16R16UI: return 3;
3184 		case FORMAT_A16B16G16R16I:  return 4;
3185 		case FORMAT_A16B16G16R16UI: return 4;
3186 		case FORMAT_A16B16G16R16:   return 4;
3187 		case FORMAT_X32B32G32R32I:  return 3;
3188 		case FORMAT_X32B32G32R32UI: return 3;
3189 		case FORMAT_A32B32G32R32I:  return 4;
3190 		case FORMAT_A32B32G32R32UI: return 4;
3191 		case FORMAT_V8U8:           return 2;
3192 		case FORMAT_Q8W8V8U8:       return 4;
3193 		case FORMAT_X8L8V8U8:       return 3;
3194 		case FORMAT_V16U16:         return 2;
3195 		case FORMAT_A16W16V16U16:   return 4;
3196 		case FORMAT_Q16W16V16U16:   return 4;
3197 		case FORMAT_R32F:           return 1;
3198 		case FORMAT_G32R32F:        return 2;
3199 		case FORMAT_X32B32G32R32F:  return 3;
3200 		case FORMAT_A32B32G32R32F:  return 4;
3201 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
3202 		case FORMAT_D32F:           return 1;
3203 		case FORMAT_D32FS8:         return 1;
3204 		case FORMAT_D32F_LOCKABLE:  return 1;
3205 		case FORMAT_D32FS8_TEXTURE: return 1;
3206 		case FORMAT_D32F_SHADOW:    return 1;
3207 		case FORMAT_D32FS8_SHADOW:  return 1;
3208 		case FORMAT_A8:             return 1;
3209 		case FORMAT_R8I:            return 1;
3210 		case FORMAT_R8:             return 1;
3211 		case FORMAT_R16I:           return 1;
3212 		case FORMAT_R16UI:          return 1;
3213 		case FORMAT_R32I:           return 1;
3214 		case FORMAT_R32UI:          return 1;
3215 		case FORMAT_L8:             return 1;
3216 		case FORMAT_L16:            return 1;
3217 		case FORMAT_A8L8:           return 2;
3218 		case FORMAT_YV12_BT601:     return 3;
3219 		case FORMAT_YV12_BT709:     return 3;
3220 		case FORMAT_YV12_JFIF:      return 3;
3221 		default:
3222 			ASSERT(false);
3223 		}
3224 
3225 		return 1;
3226 	}
3227 
allocateBuffer(int width,int height,int depth,int border,int samples,Format format)3228 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
3229 	{
3230 		return allocate(size(width, height, depth, border, samples, format));
3231 	}
3232 
memfill4(void * buffer,int pattern,int bytes)3233 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3234 	{
3235 		while((size_t)buffer & 0x1 && bytes >= 1)
3236 		{
3237 			*(char*)buffer = (char)pattern;
3238 			(char*&)buffer += 1;
3239 			bytes -= 1;
3240 		}
3241 
3242 		while((size_t)buffer & 0x3 && bytes >= 2)
3243 		{
3244 			*(short*)buffer = (short)pattern;
3245 			(short*&)buffer += 1;
3246 			bytes -= 2;
3247 		}
3248 
3249 		#if defined(__i386__) || defined(__x86_64__)
3250 			if(CPUID::supportsSSE())
3251 			{
3252 				while((size_t)buffer & 0xF && bytes >= 4)
3253 				{
3254 					*(int*)buffer = pattern;
3255 					(int*&)buffer += 1;
3256 					bytes -= 4;
3257 				}
3258 
3259 				__m128 quad = _mm_set_ps1((float&)pattern);
3260 
3261 				float *pointer = (float*)buffer;
3262 				int qxwords = bytes / 64;
3263 				bytes -= qxwords * 64;
3264 
3265 				while(qxwords--)
3266 				{
3267 					_mm_stream_ps(pointer + 0, quad);
3268 					_mm_stream_ps(pointer + 4, quad);
3269 					_mm_stream_ps(pointer + 8, quad);
3270 					_mm_stream_ps(pointer + 12, quad);
3271 
3272 					pointer += 16;
3273 				}
3274 
3275 				buffer = pointer;
3276 			}
3277 		#endif
3278 
3279 		while(bytes >= 4)
3280 		{
3281 			*(int*)buffer = (int)pattern;
3282 			(int*&)buffer += 1;
3283 			bytes -= 4;
3284 		}
3285 
3286 		while(bytes >= 2)
3287 		{
3288 			*(short*)buffer = (short)pattern;
3289 			(short*&)buffer += 1;
3290 			bytes -= 2;
3291 		}
3292 
3293 		while(bytes >= 1)
3294 		{
3295 			*(char*)buffer = (char)pattern;
3296 			(char*&)buffer += 1;
3297 			bytes -= 1;
3298 		}
3299 	}
3300 
	// Takes an EXCLUSIVE lock on the surface's resource and releases it right away.
	// NOTE(review): presumably this blocks until other users of the resource have
	// released it, acting as a synchronization point - confirm against Resource::lock.
	void Surface::sync()
	{
		resource->lock(EXCLUSIVE);
		resource->unlock();
	}
3306 
isEntire(const Rect & rect) const3307 	bool Surface::isEntire(const Rect& rect) const
3308 	{
3309 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3310 	}
3311 
getRect() const3312 	Rect Surface::getRect() const
3313 	{
3314 		return Rect(0, 0, internal.width, internal.height);
3315 	}
3316 
clearDepth(float depth,int x0,int y0,int width,int height)3317 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3318 	{
3319 		if(width == 0 || height == 0)
3320 		{
3321 			return;
3322 		}
3323 
3324 		if(internal.format == FORMAT_NULL)
3325 		{
3326 			return;
3327 		}
3328 
3329 		// Not overlapping
3330 		if(x0 > internal.width) return;
3331 		if(y0 > internal.height) return;
3332 		if(x0 + width < 0) return;
3333 		if(y0 + height < 0) return;
3334 
3335 		// Clip against dimensions
3336 		if(x0 < 0) {width += x0; x0 = 0;}
3337 		if(x0 + width > internal.width) width = internal.width - x0;
3338 		if(y0 < 0) {height += y0; y0 = 0;}
3339 		if(y0 + height > internal.height) height = internal.height - y0;
3340 
3341 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3342 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3343 
3344 		int x1 = x0 + width;
3345 		int y1 = y0 + height;
3346 
3347 		if(!hasQuadLayout(internal.format))
3348 		{
3349 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
3350 
3351 			for(int z = 0; z < internal.samples; z++)
3352 			{
3353 				float *row = target;
3354 				for(int y = y0; y < y1; y++)
3355 				{
3356 					memfill4(row, (int&)depth, width * sizeof(float));
3357 					row += internal.pitchP;
3358 				}
3359 				target += internal.sliceP;
3360 			}
3361 
3362 			unlockInternal();
3363 		}
3364 		else   // Quad layout
3365 		{
3366 			if(complementaryDepthBuffer)
3367 			{
3368 				depth = 1 - depth;
3369 			}
3370 
3371 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3372 
3373 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3374 			int oddX1 = (x1 & ~1) * 2;
3375 			int evenX0 = ((x0 + 1) & ~1) * 2;
3376 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3377 
3378 			for(int z = 0; z < internal.samples; z++)
3379 			{
3380 				for(int y = y0; y < y1; y++)
3381 				{
3382 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
3383 
3384 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3385 					{
3386 						if((x0 & 1) != 0)
3387 						{
3388 							target[oddX0 + 0] = depth;
3389 							target[oddX0 + 2] = depth;
3390 						}
3391 
3392 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3393 					//	{
3394 					//		target[x2 + 0] = depth;
3395 					//		target[x2 + 1] = depth;
3396 					//		target[x2 + 2] = depth;
3397 					//		target[x2 + 3] = depth;
3398 					//	}
3399 
3400 					//	__asm
3401 					//	{
3402 					//		movss xmm0, depth
3403 					//		shufps xmm0, xmm0, 0x00
3404 					//
3405 					//		mov eax, x0
3406 					//		add eax, 1
3407 					//		and eax, 0xFFFFFFFE
3408 					//		cmp eax, x1
3409 					//		jge qEnd
3410 					//
3411 					//		mov edi, target
3412 					//
3413 					//	qLoop:
3414 					//		movntps [edi+8*eax], xmm0
3415 					//
3416 					//		add eax, 2
3417 					//		cmp eax, x1
3418 					//		jl qLoop
3419 					//	qEnd:
3420 					//	}
3421 
3422 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3423 
3424 						if((x1 & 1) != 0)
3425 						{
3426 							target[oddX1 + 0] = depth;
3427 							target[oddX1 + 2] = depth;
3428 						}
3429 
3430 						y++;
3431 					}
3432 					else
3433 					{
3434 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3435 						{
3436 							target[i] = depth;
3437 						}
3438 					}
3439 				}
3440 
3441 				buffer += internal.sliceP;
3442 			}
3443 
3444 			unlockInternal();
3445 		}
3446 	}
3447 
clearStencil(unsigned char s,unsigned char mask,int x0,int y0,int width,int height)3448 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3449 	{
3450 		if(mask == 0 || width == 0 || height == 0)
3451 		{
3452 			return;
3453 		}
3454 
3455 		if(stencil.format == FORMAT_NULL)
3456 		{
3457 			return;
3458 		}
3459 
3460 		// Not overlapping
3461 		if(x0 > internal.width) return;
3462 		if(y0 > internal.height) return;
3463 		if(x0 + width < 0) return;
3464 		if(y0 + height < 0) return;
3465 
3466 		// Clip against dimensions
3467 		if(x0 < 0) {width += x0; x0 = 0;}
3468 		if(x0 + width > internal.width) width = internal.width - x0;
3469 		if(y0 < 0) {height += y0; y0 = 0;}
3470 		if(y0 + height > internal.height) height = internal.height - y0;
3471 
3472 		int x1 = x0 + width;
3473 		int y1 = y0 + height;
3474 
3475 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3476 		int oddX1 = (x1 & ~1) * 2;
3477 		int evenX0 = ((x0 + 1) & ~1) * 2;
3478 		int evenBytes = oddX1 - evenX0;
3479 
3480 		unsigned char maskedS = s & mask;
3481 		unsigned char invMask = ~mask;
3482 		unsigned int fill = maskedS;
3483 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3484 
3485 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3486 
3487 		// Stencil buffers are assumed to use quad layout
3488 		for(int z = 0; z < stencil.samples; z++)
3489 		{
3490 			for(int y = y0; y < y1; y++)
3491 			{
3492 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
3493 
3494 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3495 				{
3496 					if((x0 & 1) != 0)
3497 					{
3498 						target[oddX0 + 0] = fill;
3499 						target[oddX0 + 2] = fill;
3500 					}
3501 
3502 					memfill4(&target[evenX0], fill, evenBytes);
3503 
3504 					if((x1 & 1) != 0)
3505 					{
3506 						target[oddX1 + 0] = fill;
3507 						target[oddX1 + 2] = fill;
3508 					}
3509 
3510 					y++;
3511 				}
3512 				else
3513 				{
3514 					for(int x = x0; x < x1; x++)
3515 					{
3516 						int i = (x & ~1) * 2 + (x & 1);
3517 						target[i] = maskedS | (target[i] & invMask);
3518 					}
3519 				}
3520 			}
3521 
3522 			buffer += stencil.sliceP;
3523 		}
3524 
3525 		unlockStencil();
3526 	}
3527 
fill(const Color<float> & color,int x0,int y0,int width,int height)3528 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3529 	{
3530 		unsigned char *row;
3531 		Buffer *buffer;
3532 
3533 		if(internal.dirty)
3534 		{
3535 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3536 			buffer = &internal;
3537 		}
3538 		else
3539 		{
3540 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3541 			buffer = &external;
3542 		}
3543 
3544 		if(buffer->bytes <= 4)
3545 		{
3546 			int c;
3547 			buffer->write(&c, color);
3548 
3549 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3550 			if(buffer->bytes <= 2) c = (c << 16) | c;
3551 
3552 			for(int y = 0; y < height; y++)
3553 			{
3554 				memfill4(row, c, width * buffer->bytes);
3555 
3556 				row += buffer->pitchB;
3557 			}
3558 		}
3559 		else   // Generic
3560 		{
3561 			for(int y = 0; y < height; y++)
3562 			{
3563 				unsigned char *element = row;
3564 
3565 				for(int x = 0; x < width; x++)
3566 				{
3567 					buffer->write(element, color);
3568 
3569 					element += buffer->bytes;
3570 				}
3571 
3572 				row += buffer->pitchB;
3573 			}
3574 		}
3575 
3576 		if(buffer == &internal)
3577 		{
3578 			unlockInternal();
3579 		}
3580 		else
3581 		{
3582 			unlockExternal();
3583 		}
3584 	}
3585 
copyInternal(const Surface * source,int x,int y,float srcX,float srcY,bool filter)3586 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
3587 	{
3588 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3589 
3590 		sw::Color<float> color;
3591 
3592 		if(!filter)
3593 		{
3594 			color = source->internal.read((int)srcX, (int)srcY, 0);
3595 		}
3596 		else   // Bilinear filtering
3597 		{
3598 			color = source->internal.sample(srcX, srcY, 0);
3599 		}
3600 
3601 		internal.write(x, y, color);
3602 	}
3603 
copyInternal(const Surface * source,int x,int y,int z,float srcX,float srcY,float srcZ,bool filter)3604 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3605 	{
3606 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3607 
3608 		sw::Color<float> color;
3609 
3610 		if(!filter)
3611 		{
3612 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3613 		}
3614 		else   // Bilinear filtering
3615 		{
3616 			color = source->internal.sample(srcX, srcY, srcZ);
3617 		}
3618 
3619 		internal.write(x, y, z, color);
3620 	}
3621 
	// Copies the row or column of texels along edge 'srcEdge' of 'src' into the border
	// texels along edge 'dstEdge' of this surface, then recomputes two border corners
	// when the destination edge is LEFT or RIGHT.
	// Both surfaces must be square, of equal size, and have the same bytes per texel (asserted).
	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
	{
		Surface *dst = this;

		// Figure out whether the edge has to be copied in reverse order.
		// The copy should be reversed whenever the same edges are contiguous or if we're
		// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
		//
		//      | +y |
		// | -x | +z | +x | -z |
		//      | -y |

		bool reverse = (srcEdge == dstEdge) ||
		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));

		int srcBytes = src->bytes(src->Surface::getInternalFormat());
		int srcPitch = src->getInternalPitchB();
		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
		int dstPitch = dst->getInternalPitchB();

		int srcW = src->getWidth();
		int srcH = src->getHeight();
		int dstW = dst->getWidth();
		int dstH = dst->getHeight();

		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);

		// Src is expressed in the regular [0, width-1], [0, height-1] space
		// Horizontal edges advance by one texel, vertical edges by one row.
		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
		// Byte offset of the first texel of the source edge, relative to texel (0, 0)
		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));

		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
		// The step is negated for a reversed copy, so the destination is walked backwards.
		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
		// Byte offset relative to border texel (-1, -1): the start of the destination edge, moved one
		// step past the corner for a forward copy, or dstW steps along the edge for a reversed one.
		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);

		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;

		// Copy the edge one texel at a time
		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
		{
			memcpy(dstBuf, srcBuf, srcBytes);
		}

		if(dstEdge == LEFT || dstEdge == RIGHT)
		{
			// TOP and BOTTOM are already set, let's average out the corners
			// (x0, y0) is the border corner, (x1, y1) the adjacent texel inside the face;
			// the first call handles the top corner, the second the bottom corner.
			int x0 = (dstEdge == RIGHT) ? dstW : -1;
			int y0 = -1;
			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
			int y1 = 0;
			dst->computeCubeCorner(x0, y0, x1, y1);
			y0 = dstH;
			y1 = dstH - 1;
			dst->computeCubeCorner(x0, y0, x1, y1);
		}

		src->unlockInternal();
		dst->unlockInternal();
	}
3684 
computeCubeCorner(int x0,int y0,int x1,int y1)3685 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
3686 	{
3687 		ASSERT(internal.lock != LOCK_UNLOCKED);
3688 
3689 		sw::Color<float> color = internal.read(x0, y1);
3690 		color += internal.read(x1, y0);
3691 		color += internal.read(x1, y1);
3692 		color *= (1.0f / 3.0f);
3693 
3694 		internal.write(x0, y0, color);
3695 	}
3696 
hasStencil() const3697 	bool Surface::hasStencil() const
3698 	{
3699 		return isStencil(external.format);
3700 	}
3701 
hasDepth() const3702 	bool Surface::hasDepth() const
3703 	{
3704 		return isDepth(external.format);
3705 	}
3706 
hasPalette() const3707 	bool Surface::hasPalette() const
3708 	{
3709 		return isPalette(external.format);
3710 	}
3711 
isRenderTarget() const3712 	bool Surface::isRenderTarget() const
3713 	{
3714 		return renderTarget;
3715 	}
3716 
hasDirtyContents() const3717 	bool Surface::hasDirtyContents() const
3718 	{
3719 		return dirtyContents;
3720 	}
3721 
markContentsClean()3722 	void Surface::markContentsClean()
3723 	{
3724 		dirtyContents = false;
3725 	}
3726 
getResource()3727 	Resource *Surface::getResource()
3728 	{
3729 		return resource;
3730 	}
3731 
identicalBuffers() const3732 	bool Surface::identicalBuffers() const
3733 	{
3734 		return external.format == internal.format &&
3735 		       external.width  == internal.width &&
3736 		       external.height == internal.height &&
3737 		       external.depth  == internal.depth &&
3738 		       external.pitchB == internal.pitchB &&
3739 		       external.sliceB == internal.sliceB &&
3740 		       external.border == internal.border &&
3741 		       external.samples == internal.samples;
3742 	}
3743 
selectInternalFormat(Format format) const3744 	Format Surface::selectInternalFormat(Format format) const
3745 	{
3746 		switch(format)
3747 		{
3748 		case FORMAT_NULL:
3749 			return FORMAT_NULL;
3750 		case FORMAT_P8:
3751 		case FORMAT_A8P8:
3752 		case FORMAT_A4R4G4B4:
3753 		case FORMAT_A1R5G5B5:
3754 		case FORMAT_A8R3G3B2:
3755 			return FORMAT_A8R8G8B8;
3756 		case FORMAT_A8:
3757 			return FORMAT_A8;
3758 		case FORMAT_R8I:
3759 			return FORMAT_R8I;
3760 		case FORMAT_R8UI:
3761 			return FORMAT_R8UI;
3762 		case FORMAT_R8_SNORM:
3763 			return FORMAT_R8_SNORM;
3764 		case FORMAT_R8:
3765 			return FORMAT_R8;
3766 		case FORMAT_R16I:
3767 			return FORMAT_R16I;
3768 		case FORMAT_R16UI:
3769 			return FORMAT_R16UI;
3770 		case FORMAT_R32I:
3771 			return FORMAT_R32I;
3772 		case FORMAT_R32UI:
3773 			return FORMAT_R32UI;
3774 		case FORMAT_X16B16G16R16I:
3775 			return FORMAT_X16B16G16R16I;
3776 		case FORMAT_A16B16G16R16I:
3777 			return FORMAT_A16B16G16R16I;
3778 		case FORMAT_X16B16G16R16UI:
3779 			return FORMAT_X16B16G16R16UI;
3780 		case FORMAT_A16B16G16R16UI:
3781 			return FORMAT_A16B16G16R16UI;
3782 		case FORMAT_A2R10G10B10:
3783 		case FORMAT_A2B10G10R10:
3784 		case FORMAT_A16B16G16R16:
3785 			return FORMAT_A16B16G16R16;
3786 		case FORMAT_A2B10G10R10UI:
3787 			return FORMAT_A16B16G16R16UI;
3788 		case FORMAT_X32B32G32R32I:
3789 			return FORMAT_X32B32G32R32I;
3790 		case FORMAT_A32B32G32R32I:
3791 			return FORMAT_A32B32G32R32I;
3792 		case FORMAT_X32B32G32R32UI:
3793 			return FORMAT_X32B32G32R32UI;
3794 		case FORMAT_A32B32G32R32UI:
3795 			return FORMAT_A32B32G32R32UI;
3796 		case FORMAT_G8R8I:
3797 			return FORMAT_G8R8I;
3798 		case FORMAT_G8R8UI:
3799 			return FORMAT_G8R8UI;
3800 		case FORMAT_G8R8_SNORM:
3801 			return FORMAT_G8R8_SNORM;
3802 		case FORMAT_G8R8:
3803 			return FORMAT_G8R8;
3804 		case FORMAT_G16R16I:
3805 			return FORMAT_G16R16I;
3806 		case FORMAT_G16R16UI:
3807 			return FORMAT_G16R16UI;
3808 		case FORMAT_G16R16:
3809 			return FORMAT_G16R16;
3810 		case FORMAT_G32R32I:
3811 			return FORMAT_G32R32I;
3812 		case FORMAT_G32R32UI:
3813 			return FORMAT_G32R32UI;
3814 		case FORMAT_A8R8G8B8:
3815 			if(lockable || !quadLayoutEnabled)
3816 			{
3817 				return FORMAT_A8R8G8B8;
3818 			}
3819 			else
3820 			{
3821 				return FORMAT_A8G8R8B8Q;
3822 			}
3823 		case FORMAT_A8B8G8R8I:
3824 			return FORMAT_A8B8G8R8I;
3825 		case FORMAT_A8B8G8R8UI:
3826 			return FORMAT_A8B8G8R8UI;
3827 		case FORMAT_A8B8G8R8_SNORM:
3828 			return FORMAT_A8B8G8R8_SNORM;
3829 		case FORMAT_R5G5B5A1:
3830 		case FORMAT_R4G4B4A4:
3831 		case FORMAT_A8B8G8R8:
3832 			return FORMAT_A8B8G8R8;
3833 		case FORMAT_R5G6B5:
3834 			return FORMAT_R5G6B5;
3835 		case FORMAT_R3G3B2:
3836 		case FORMAT_R8G8B8:
3837 		case FORMAT_X4R4G4B4:
3838 		case FORMAT_X1R5G5B5:
3839 		case FORMAT_X8R8G8B8:
3840 			if(lockable || !quadLayoutEnabled)
3841 			{
3842 				return FORMAT_X8R8G8B8;
3843 			}
3844 			else
3845 			{
3846 				return FORMAT_X8G8R8B8Q;
3847 			}
3848 		case FORMAT_X8B8G8R8I:
3849 			return FORMAT_X8B8G8R8I;
3850 		case FORMAT_X8B8G8R8UI:
3851 			return FORMAT_X8B8G8R8UI;
3852 		case FORMAT_X8B8G8R8_SNORM:
3853 			return FORMAT_X8B8G8R8_SNORM;
3854 		case FORMAT_B8G8R8:
3855 		case FORMAT_X8B8G8R8:
3856 			return FORMAT_X8B8G8R8;
3857 		case FORMAT_SRGB8_X8:
3858 			return FORMAT_SRGB8_X8;
3859 		case FORMAT_SRGB8_A8:
3860 			return FORMAT_SRGB8_A8;
3861 		// Compressed formats
3862 		case FORMAT_DXT1:
3863 		case FORMAT_DXT3:
3864 		case FORMAT_DXT5:
3865 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3866 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3867 		case FORMAT_RGBA8_ETC2_EAC:
3868 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3869 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3870 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3871 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3872 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3873 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3874 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3875 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3876 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3877 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3878 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3879 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3880 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3881 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3882 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3883 			return FORMAT_A8R8G8B8;
3884 		case FORMAT_RGBA_ASTC_4x4_KHR:
3885 		case FORMAT_RGBA_ASTC_5x4_KHR:
3886 		case FORMAT_RGBA_ASTC_5x5_KHR:
3887 		case FORMAT_RGBA_ASTC_6x5_KHR:
3888 		case FORMAT_RGBA_ASTC_6x6_KHR:
3889 		case FORMAT_RGBA_ASTC_8x5_KHR:
3890 		case FORMAT_RGBA_ASTC_8x6_KHR:
3891 		case FORMAT_RGBA_ASTC_8x8_KHR:
3892 		case FORMAT_RGBA_ASTC_10x5_KHR:
3893 		case FORMAT_RGBA_ASTC_10x6_KHR:
3894 		case FORMAT_RGBA_ASTC_10x8_KHR:
3895 		case FORMAT_RGBA_ASTC_10x10_KHR:
3896 		case FORMAT_RGBA_ASTC_12x10_KHR:
3897 		case FORMAT_RGBA_ASTC_12x12_KHR:
3898 			// ASTC supports HDR, so a floating point format is required to represent it properly
3899 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3900 		case FORMAT_ATI1:
3901 			return FORMAT_R8;
3902 		case FORMAT_R11_EAC:
3903 		case FORMAT_SIGNED_R11_EAC:
3904 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3905 		case FORMAT_ATI2:
3906 			return FORMAT_G8R8;
3907 		case FORMAT_RG11_EAC:
3908 		case FORMAT_SIGNED_RG11_EAC:
3909 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3910 		case FORMAT_ETC1:
3911 		case FORMAT_RGB8_ETC2:
3912 		case FORMAT_SRGB8_ETC2:
3913 			return FORMAT_X8R8G8B8;
3914 		// Bumpmap formats
3915 		case FORMAT_V8U8:			return FORMAT_V8U8;
3916 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3917 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3918 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3919 		case FORMAT_V16U16:			return FORMAT_V16U16;
3920 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3921 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3922 		// Floating-point formats
3923 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3924 		case FORMAT_R16F:			return FORMAT_R32F;
3925 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3926 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3927 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
3928 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3929 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3930 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3931 		case FORMAT_R32F:			return FORMAT_R32F;
3932 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3933 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3934 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3935 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3936 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3937 		// Luminance formats
3938 		case FORMAT_L8:				return FORMAT_L8;
3939 		case FORMAT_A4L4:			return FORMAT_A8L8;
3940 		case FORMAT_L16:			return FORMAT_L16;
3941 		case FORMAT_A8L8:			return FORMAT_A8L8;
3942 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3943 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3944 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3945 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3946 		// Depth/stencil formats
3947 		case FORMAT_D16:
3948 		case FORMAT_D32:
3949 		case FORMAT_D24X8:
3950 			if(hasParent)   // Texture
3951 			{
3952 				return FORMAT_D32F_SHADOW;
3953 			}
3954 			else if(complementaryDepthBuffer)
3955 			{
3956 				return FORMAT_D32F_COMPLEMENTARY;
3957 			}
3958 			else
3959 			{
3960 				return FORMAT_D32F;
3961 			}
3962 		case FORMAT_D24S8:
3963 		case FORMAT_D24FS8:
3964 			if(hasParent)   // Texture
3965 			{
3966 				return FORMAT_D32FS8_SHADOW;
3967 			}
3968 			else if(complementaryDepthBuffer)
3969 			{
3970 				return FORMAT_D32FS8_COMPLEMENTARY;
3971 			}
3972 			else
3973 			{
3974 				return FORMAT_D32FS8;
3975 			}
3976 		case FORMAT_D32F:           return FORMAT_D32F;
3977 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
3978 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
3979 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
3980 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
3981 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
3982 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
3983 		case FORMAT_S8:             return FORMAT_S8;
3984 		// YUV formats
3985 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
3986 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
3987 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
3988 		default:
3989 			ASSERT(false);
3990 		}
3991 
3992 		return FORMAT_NULL;
3993 	}
3994 
setTexturePalette(unsigned int * palette)3995 	void Surface::setTexturePalette(unsigned int *palette)
3996 	{
3997 		Surface::palette = palette;
3998 		Surface::paletteID++;
3999 	}
4000 
resolve()4001 	void Surface::resolve()
4002 	{
4003 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
4004 		{
4005 			return;
4006 		}
4007 
4008 		ASSERT(internal.depth == 1);  // Unimplemented
4009 
4010 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
4011 
4012 		int width = internal.width;
4013 		int height = internal.height;
4014 		int pitch = internal.pitchB;
4015 		int slice = internal.sliceB;
4016 
4017 		unsigned char *source0 = (unsigned char*)source;
4018 		unsigned char *source1 = source0 + slice;
4019 		unsigned char *source2 = source1 + slice;
4020 		unsigned char *source3 = source2 + slice;
4021 		unsigned char *source4 = source3 + slice;
4022 		unsigned char *source5 = source4 + slice;
4023 		unsigned char *source6 = source5 + slice;
4024 		unsigned char *source7 = source6 + slice;
4025 		unsigned char *source8 = source7 + slice;
4026 		unsigned char *source9 = source8 + slice;
4027 		unsigned char *sourceA = source9 + slice;
4028 		unsigned char *sourceB = sourceA + slice;
4029 		unsigned char *sourceC = sourceB + slice;
4030 		unsigned char *sourceD = sourceC + slice;
4031 		unsigned char *sourceE = sourceD + slice;
4032 		unsigned char *sourceF = sourceE + slice;
4033 
4034 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
4035 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
4036 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
4037 		{
4038 			#if defined(__i386__) || defined(__x86_64__)
4039 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4040 				{
4041 					if(internal.samples == 2)
4042 					{
4043 						for(int y = 0; y < height; y++)
4044 						{
4045 							for(int x = 0; x < width; x += 4)
4046 							{
4047 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4048 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4049 
4050 								c0 = _mm_avg_epu8(c0, c1);
4051 
4052 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4053 							}
4054 
4055 							source0 += pitch;
4056 							source1 += pitch;
4057 						}
4058 					}
4059 					else if(internal.samples == 4)
4060 					{
4061 						for(int y = 0; y < height; y++)
4062 						{
4063 							for(int x = 0; x < width; x += 4)
4064 							{
4065 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4066 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4067 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4068 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4069 
4070 								c0 = _mm_avg_epu8(c0, c1);
4071 								c2 = _mm_avg_epu8(c2, c3);
4072 								c0 = _mm_avg_epu8(c0, c2);
4073 
4074 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4075 							}
4076 
4077 							source0 += pitch;
4078 							source1 += pitch;
4079 							source2 += pitch;
4080 							source3 += pitch;
4081 						}
4082 					}
4083 					else if(internal.samples == 8)
4084 					{
4085 						for(int y = 0; y < height; y++)
4086 						{
4087 							for(int x = 0; x < width; x += 4)
4088 							{
4089 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4090 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4091 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4092 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4093 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4094 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4095 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4096 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4097 
4098 								c0 = _mm_avg_epu8(c0, c1);
4099 								c2 = _mm_avg_epu8(c2, c3);
4100 								c4 = _mm_avg_epu8(c4, c5);
4101 								c6 = _mm_avg_epu8(c6, c7);
4102 								c0 = _mm_avg_epu8(c0, c2);
4103 								c4 = _mm_avg_epu8(c4, c6);
4104 								c0 = _mm_avg_epu8(c0, c4);
4105 
4106 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4107 							}
4108 
4109 							source0 += pitch;
4110 							source1 += pitch;
4111 							source2 += pitch;
4112 							source3 += pitch;
4113 							source4 += pitch;
4114 							source5 += pitch;
4115 							source6 += pitch;
4116 							source7 += pitch;
4117 						}
4118 					}
4119 					else if(internal.samples == 16)
4120 					{
4121 						for(int y = 0; y < height; y++)
4122 						{
4123 							for(int x = 0; x < width; x += 4)
4124 							{
4125 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4126 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4127 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4128 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4129 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4130 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4131 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4132 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4133 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4134 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4135 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4136 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4137 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4138 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4139 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4140 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4141 
4142 								c0 = _mm_avg_epu8(c0, c1);
4143 								c2 = _mm_avg_epu8(c2, c3);
4144 								c4 = _mm_avg_epu8(c4, c5);
4145 								c6 = _mm_avg_epu8(c6, c7);
4146 								c8 = _mm_avg_epu8(c8, c9);
4147 								cA = _mm_avg_epu8(cA, cB);
4148 								cC = _mm_avg_epu8(cC, cD);
4149 								cE = _mm_avg_epu8(cE, cF);
4150 								c0 = _mm_avg_epu8(c0, c2);
4151 								c4 = _mm_avg_epu8(c4, c6);
4152 								c8 = _mm_avg_epu8(c8, cA);
4153 								cC = _mm_avg_epu8(cC, cE);
4154 								c0 = _mm_avg_epu8(c0, c4);
4155 								c8 = _mm_avg_epu8(c8, cC);
4156 								c0 = _mm_avg_epu8(c0, c8);
4157 
4158 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4159 							}
4160 
4161 							source0 += pitch;
4162 							source1 += pitch;
4163 							source2 += pitch;
4164 							source3 += pitch;
4165 							source4 += pitch;
4166 							source5 += pitch;
4167 							source6 += pitch;
4168 							source7 += pitch;
4169 							source8 += pitch;
4170 							source9 += pitch;
4171 							sourceA += pitch;
4172 							sourceB += pitch;
4173 							sourceC += pitch;
4174 							sourceD += pitch;
4175 							sourceE += pitch;
4176 							sourceF += pitch;
4177 						}
4178 					}
4179 					else ASSERT(false);
4180 				}
4181 				else
4182 			#endif
4183 			{
4184 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
4185 
4186 				if(internal.samples == 2)
4187 				{
4188 					for(int y = 0; y < height; y++)
4189 					{
4190 						for(int x = 0; x < width; x++)
4191 						{
4192 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4193 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4194 
4195 							c0 = AVERAGE(c0, c1);
4196 
4197 							*(unsigned int*)(source0 + 4 * x) = c0;
4198 						}
4199 
4200 						source0 += pitch;
4201 						source1 += pitch;
4202 					}
4203 				}
4204 				else if(internal.samples == 4)
4205 				{
4206 					for(int y = 0; y < height; y++)
4207 					{
4208 						for(int x = 0; x < width; x++)
4209 						{
4210 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4211 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4212 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4213 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4214 
4215 							c0 = AVERAGE(c0, c1);
4216 							c2 = AVERAGE(c2, c3);
4217 							c0 = AVERAGE(c0, c2);
4218 
4219 							*(unsigned int*)(source0 + 4 * x) = c0;
4220 						}
4221 
4222 						source0 += pitch;
4223 						source1 += pitch;
4224 						source2 += pitch;
4225 						source3 += pitch;
4226 					}
4227 				}
4228 				else if(internal.samples == 8)
4229 				{
4230 					for(int y = 0; y < height; y++)
4231 					{
4232 						for(int x = 0; x < width; x++)
4233 						{
4234 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4235 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4236 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4237 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4238 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4239 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4240 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4241 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4242 
4243 							c0 = AVERAGE(c0, c1);
4244 							c2 = AVERAGE(c2, c3);
4245 							c4 = AVERAGE(c4, c5);
4246 							c6 = AVERAGE(c6, c7);
4247 							c0 = AVERAGE(c0, c2);
4248 							c4 = AVERAGE(c4, c6);
4249 							c0 = AVERAGE(c0, c4);
4250 
4251 							*(unsigned int*)(source0 + 4 * x) = c0;
4252 						}
4253 
4254 						source0 += pitch;
4255 						source1 += pitch;
4256 						source2 += pitch;
4257 						source3 += pitch;
4258 						source4 += pitch;
4259 						source5 += pitch;
4260 						source6 += pitch;
4261 						source7 += pitch;
4262 					}
4263 				}
4264 				else if(internal.samples == 16)
4265 				{
4266 					for(int y = 0; y < height; y++)
4267 					{
4268 						for(int x = 0; x < width; x++)
4269 						{
4270 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4271 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4272 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4273 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4274 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4275 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4276 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4277 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4278 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4279 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4280 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4281 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4282 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4283 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4284 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4285 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4286 
4287 							c0 = AVERAGE(c0, c1);
4288 							c2 = AVERAGE(c2, c3);
4289 							c4 = AVERAGE(c4, c5);
4290 							c6 = AVERAGE(c6, c7);
4291 							c8 = AVERAGE(c8, c9);
4292 							cA = AVERAGE(cA, cB);
4293 							cC = AVERAGE(cC, cD);
4294 							cE = AVERAGE(cE, cF);
4295 							c0 = AVERAGE(c0, c2);
4296 							c4 = AVERAGE(c4, c6);
4297 							c8 = AVERAGE(c8, cA);
4298 							cC = AVERAGE(cC, cE);
4299 							c0 = AVERAGE(c0, c4);
4300 							c8 = AVERAGE(c8, cC);
4301 							c0 = AVERAGE(c0, c8);
4302 
4303 							*(unsigned int*)(source0 + 4 * x) = c0;
4304 						}
4305 
4306 						source0 += pitch;
4307 						source1 += pitch;
4308 						source2 += pitch;
4309 						source3 += pitch;
4310 						source4 += pitch;
4311 						source5 += pitch;
4312 						source6 += pitch;
4313 						source7 += pitch;
4314 						source8 += pitch;
4315 						source9 += pitch;
4316 						sourceA += pitch;
4317 						sourceB += pitch;
4318 						sourceC += pitch;
4319 						sourceD += pitch;
4320 						sourceE += pitch;
4321 						sourceF += pitch;
4322 					}
4323 				}
4324 				else ASSERT(false);
4325 
4326 				#undef AVERAGE
4327 			}
4328 		}
4329 		else if(internal.format == FORMAT_G16R16)
4330 		{
4331 
4332 			#if defined(__i386__) || defined(__x86_64__)
4333 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4334 				{
4335 					if(internal.samples == 2)
4336 					{
4337 						for(int y = 0; y < height; y++)
4338 						{
4339 							for(int x = 0; x < width; x += 4)
4340 							{
4341 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4342 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4343 
4344 								c0 = _mm_avg_epu16(c0, c1);
4345 
4346 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4347 							}
4348 
4349 							source0 += pitch;
4350 							source1 += pitch;
4351 						}
4352 					}
4353 					else if(internal.samples == 4)
4354 					{
4355 						for(int y = 0; y < height; y++)
4356 						{
4357 							for(int x = 0; x < width; x += 4)
4358 							{
4359 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4360 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4361 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4362 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4363 
4364 								c0 = _mm_avg_epu16(c0, c1);
4365 								c2 = _mm_avg_epu16(c2, c3);
4366 								c0 = _mm_avg_epu16(c0, c2);
4367 
4368 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4369 							}
4370 
4371 							source0 += pitch;
4372 							source1 += pitch;
4373 							source2 += pitch;
4374 							source3 += pitch;
4375 						}
4376 					}
4377 					else if(internal.samples == 8)
4378 					{
4379 						for(int y = 0; y < height; y++)
4380 						{
4381 							for(int x = 0; x < width; x += 4)
4382 							{
4383 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4384 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4385 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4386 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4387 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4388 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4389 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4390 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4391 
4392 								c0 = _mm_avg_epu16(c0, c1);
4393 								c2 = _mm_avg_epu16(c2, c3);
4394 								c4 = _mm_avg_epu16(c4, c5);
4395 								c6 = _mm_avg_epu16(c6, c7);
4396 								c0 = _mm_avg_epu16(c0, c2);
4397 								c4 = _mm_avg_epu16(c4, c6);
4398 								c0 = _mm_avg_epu16(c0, c4);
4399 
4400 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4401 							}
4402 
4403 							source0 += pitch;
4404 							source1 += pitch;
4405 							source2 += pitch;
4406 							source3 += pitch;
4407 							source4 += pitch;
4408 							source5 += pitch;
4409 							source6 += pitch;
4410 							source7 += pitch;
4411 						}
4412 					}
4413 					else if(internal.samples == 16)
4414 					{
4415 						for(int y = 0; y < height; y++)
4416 						{
4417 							for(int x = 0; x < width; x += 4)
4418 							{
4419 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4420 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4421 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4422 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4423 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4424 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4425 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4426 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4427 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4428 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4429 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4430 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4431 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4432 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4433 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4434 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4435 
4436 								c0 = _mm_avg_epu16(c0, c1);
4437 								c2 = _mm_avg_epu16(c2, c3);
4438 								c4 = _mm_avg_epu16(c4, c5);
4439 								c6 = _mm_avg_epu16(c6, c7);
4440 								c8 = _mm_avg_epu16(c8, c9);
4441 								cA = _mm_avg_epu16(cA, cB);
4442 								cC = _mm_avg_epu16(cC, cD);
4443 								cE = _mm_avg_epu16(cE, cF);
4444 								c0 = _mm_avg_epu16(c0, c2);
4445 								c4 = _mm_avg_epu16(c4, c6);
4446 								c8 = _mm_avg_epu16(c8, cA);
4447 								cC = _mm_avg_epu16(cC, cE);
4448 								c0 = _mm_avg_epu16(c0, c4);
4449 								c8 = _mm_avg_epu16(c8, cC);
4450 								c0 = _mm_avg_epu16(c0, c8);
4451 
4452 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4453 							}
4454 
4455 							source0 += pitch;
4456 							source1 += pitch;
4457 							source2 += pitch;
4458 							source3 += pitch;
4459 							source4 += pitch;
4460 							source5 += pitch;
4461 							source6 += pitch;
4462 							source7 += pitch;
4463 							source8 += pitch;
4464 							source9 += pitch;
4465 							sourceA += pitch;
4466 							sourceB += pitch;
4467 							sourceC += pitch;
4468 							sourceD += pitch;
4469 							sourceE += pitch;
4470 							sourceF += pitch;
4471 						}
4472 					}
4473 					else ASSERT(false);
4474 				}
4475 				else
4476 			#endif
4477 			{
4478 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4479 
4480 				if(internal.samples == 2)
4481 				{
4482 					for(int y = 0; y < height; y++)
4483 					{
4484 						for(int x = 0; x < width; x++)
4485 						{
4486 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4487 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4488 
4489 							c0 = AVERAGE(c0, c1);
4490 
4491 							*(unsigned int*)(source0 + 4 * x) = c0;
4492 						}
4493 
4494 						source0 += pitch;
4495 						source1 += pitch;
4496 					}
4497 				}
4498 				else if(internal.samples == 4)
4499 				{
4500 					for(int y = 0; y < height; y++)
4501 					{
4502 						for(int x = 0; x < width; x++)
4503 						{
4504 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4505 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4506 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4507 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4508 
4509 							c0 = AVERAGE(c0, c1);
4510 							c2 = AVERAGE(c2, c3);
4511 							c0 = AVERAGE(c0, c2);
4512 
4513 							*(unsigned int*)(source0 + 4 * x) = c0;
4514 						}
4515 
4516 						source0 += pitch;
4517 						source1 += pitch;
4518 						source2 += pitch;
4519 						source3 += pitch;
4520 					}
4521 				}
4522 				else if(internal.samples == 8)
4523 				{
4524 					for(int y = 0; y < height; y++)
4525 					{
4526 						for(int x = 0; x < width; x++)
4527 						{
4528 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4529 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4530 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4531 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4532 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4533 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4534 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4535 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4536 
4537 							c0 = AVERAGE(c0, c1);
4538 							c2 = AVERAGE(c2, c3);
4539 							c4 = AVERAGE(c4, c5);
4540 							c6 = AVERAGE(c6, c7);
4541 							c0 = AVERAGE(c0, c2);
4542 							c4 = AVERAGE(c4, c6);
4543 							c0 = AVERAGE(c0, c4);
4544 
4545 							*(unsigned int*)(source0 + 4 * x) = c0;
4546 						}
4547 
4548 						source0 += pitch;
4549 						source1 += pitch;
4550 						source2 += pitch;
4551 						source3 += pitch;
4552 						source4 += pitch;
4553 						source5 += pitch;
4554 						source6 += pitch;
4555 						source7 += pitch;
4556 					}
4557 				}
4558 				else if(internal.samples == 16)
4559 				{
4560 					for(int y = 0; y < height; y++)
4561 					{
4562 						for(int x = 0; x < width; x++)
4563 						{
4564 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4565 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4566 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4567 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4568 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4569 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4570 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4571 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4572 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4573 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4574 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4575 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4576 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4577 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4578 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4579 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4580 
4581 							c0 = AVERAGE(c0, c1);
4582 							c2 = AVERAGE(c2, c3);
4583 							c4 = AVERAGE(c4, c5);
4584 							c6 = AVERAGE(c6, c7);
4585 							c8 = AVERAGE(c8, c9);
4586 							cA = AVERAGE(cA, cB);
4587 							cC = AVERAGE(cC, cD);
4588 							cE = AVERAGE(cE, cF);
4589 							c0 = AVERAGE(c0, c2);
4590 							c4 = AVERAGE(c4, c6);
4591 							c8 = AVERAGE(c8, cA);
4592 							cC = AVERAGE(cC, cE);
4593 							c0 = AVERAGE(c0, c4);
4594 							c8 = AVERAGE(c8, cC);
4595 							c0 = AVERAGE(c0, c8);
4596 
4597 							*(unsigned int*)(source0 + 4 * x) = c0;
4598 						}
4599 
4600 						source0 += pitch;
4601 						source1 += pitch;
4602 						source2 += pitch;
4603 						source3 += pitch;
4604 						source4 += pitch;
4605 						source5 += pitch;
4606 						source6 += pitch;
4607 						source7 += pitch;
4608 						source8 += pitch;
4609 						source9 += pitch;
4610 						sourceA += pitch;
4611 						sourceB += pitch;
4612 						sourceC += pitch;
4613 						sourceD += pitch;
4614 						sourceE += pitch;
4615 						sourceF += pitch;
4616 					}
4617 				}
4618 				else ASSERT(false);
4619 
4620 				#undef AVERAGE
4621 			}
4622 		}
4623 		else if(internal.format == FORMAT_A16B16G16R16)
4624 		{
4625 			#if defined(__i386__) || defined(__x86_64__)
4626 				if(CPUID::supportsSSE2() && (width % 2) == 0)
4627 				{
4628 					if(internal.samples == 2)
4629 					{
4630 						for(int y = 0; y < height; y++)
4631 						{
4632 							for(int x = 0; x < width; x += 2)
4633 							{
4634 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4635 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4636 
4637 								c0 = _mm_avg_epu16(c0, c1);
4638 
4639 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4640 							}
4641 
4642 							source0 += pitch;
4643 							source1 += pitch;
4644 						}
4645 					}
4646 					else if(internal.samples == 4)
4647 					{
4648 						for(int y = 0; y < height; y++)
4649 						{
4650 							for(int x = 0; x < width; x += 2)
4651 							{
4652 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4653 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4654 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4655 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4656 
4657 								c0 = _mm_avg_epu16(c0, c1);
4658 								c2 = _mm_avg_epu16(c2, c3);
4659 								c0 = _mm_avg_epu16(c0, c2);
4660 
4661 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4662 							}
4663 
4664 							source0 += pitch;
4665 							source1 += pitch;
4666 							source2 += pitch;
4667 							source3 += pitch;
4668 						}
4669 					}
4670 					else if(internal.samples == 8)
4671 					{
4672 						for(int y = 0; y < height; y++)
4673 						{
4674 							for(int x = 0; x < width; x += 2)
4675 							{
4676 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4677 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4678 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4679 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4680 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4681 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4682 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4683 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4684 
4685 								c0 = _mm_avg_epu16(c0, c1);
4686 								c2 = _mm_avg_epu16(c2, c3);
4687 								c4 = _mm_avg_epu16(c4, c5);
4688 								c6 = _mm_avg_epu16(c6, c7);
4689 								c0 = _mm_avg_epu16(c0, c2);
4690 								c4 = _mm_avg_epu16(c4, c6);
4691 								c0 = _mm_avg_epu16(c0, c4);
4692 
4693 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4694 							}
4695 
4696 							source0 += pitch;
4697 							source1 += pitch;
4698 							source2 += pitch;
4699 							source3 += pitch;
4700 							source4 += pitch;
4701 							source5 += pitch;
4702 							source6 += pitch;
4703 							source7 += pitch;
4704 						}
4705 					}
4706 					else if(internal.samples == 16)
4707 					{
4708 						for(int y = 0; y < height; y++)
4709 						{
4710 							for(int x = 0; x < width; x += 2)
4711 							{
4712 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4713 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4714 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4715 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4716 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4717 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4718 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4719 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4720 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4721 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4722 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4723 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4724 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4725 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4726 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4727 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4728 
4729 								c0 = _mm_avg_epu16(c0, c1);
4730 								c2 = _mm_avg_epu16(c2, c3);
4731 								c4 = _mm_avg_epu16(c4, c5);
4732 								c6 = _mm_avg_epu16(c6, c7);
4733 								c8 = _mm_avg_epu16(c8, c9);
4734 								cA = _mm_avg_epu16(cA, cB);
4735 								cC = _mm_avg_epu16(cC, cD);
4736 								cE = _mm_avg_epu16(cE, cF);
4737 								c0 = _mm_avg_epu16(c0, c2);
4738 								c4 = _mm_avg_epu16(c4, c6);
4739 								c8 = _mm_avg_epu16(c8, cA);
4740 								cC = _mm_avg_epu16(cC, cE);
4741 								c0 = _mm_avg_epu16(c0, c4);
4742 								c8 = _mm_avg_epu16(c8, cC);
4743 								c0 = _mm_avg_epu16(c0, c8);
4744 
4745 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4746 							}
4747 
4748 							source0 += pitch;
4749 							source1 += pitch;
4750 							source2 += pitch;
4751 							source3 += pitch;
4752 							source4 += pitch;
4753 							source5 += pitch;
4754 							source6 += pitch;
4755 							source7 += pitch;
4756 							source8 += pitch;
4757 							source9 += pitch;
4758 							sourceA += pitch;
4759 							sourceB += pitch;
4760 							sourceC += pitch;
4761 							sourceD += pitch;
4762 							sourceE += pitch;
4763 							sourceF += pitch;
4764 						}
4765 					}
4766 					else ASSERT(false);
4767 				}
4768 				else
4769 			#endif
4770 			{
4771 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4772 
4773 				if(internal.samples == 2)
4774 				{
4775 					for(int y = 0; y < height; y++)
4776 					{
4777 						for(int x = 0; x < 2 * width; x++)
4778 						{
4779 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4780 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4781 
4782 							c0 = AVERAGE(c0, c1);
4783 
4784 							*(unsigned int*)(source0 + 4 * x) = c0;
4785 						}
4786 
4787 						source0 += pitch;
4788 						source1 += pitch;
4789 					}
4790 				}
4791 				else if(internal.samples == 4)
4792 				{
4793 					for(int y = 0; y < height; y++)
4794 					{
4795 						for(int x = 0; x < 2 * width; x++)
4796 						{
4797 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4798 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4799 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4800 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4801 
4802 							c0 = AVERAGE(c0, c1);
4803 							c2 = AVERAGE(c2, c3);
4804 							c0 = AVERAGE(c0, c2);
4805 
4806 							*(unsigned int*)(source0 + 4 * x) = c0;
4807 						}
4808 
4809 						source0 += pitch;
4810 						source1 += pitch;
4811 						source2 += pitch;
4812 						source3 += pitch;
4813 					}
4814 				}
4815 				else if(internal.samples == 8)
4816 				{
4817 					for(int y = 0; y < height; y++)
4818 					{
4819 						for(int x = 0; x < 2 * width; x++)
4820 						{
4821 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4822 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4823 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4824 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4825 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4826 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4827 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4828 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4829 
4830 							c0 = AVERAGE(c0, c1);
4831 							c2 = AVERAGE(c2, c3);
4832 							c4 = AVERAGE(c4, c5);
4833 							c6 = AVERAGE(c6, c7);
4834 							c0 = AVERAGE(c0, c2);
4835 							c4 = AVERAGE(c4, c6);
4836 							c0 = AVERAGE(c0, c4);
4837 
4838 							*(unsigned int*)(source0 + 4 * x) = c0;
4839 						}
4840 
4841 						source0 += pitch;
4842 						source1 += pitch;
4843 						source2 += pitch;
4844 						source3 += pitch;
4845 						source4 += pitch;
4846 						source5 += pitch;
4847 						source6 += pitch;
4848 						source7 += pitch;
4849 					}
4850 				}
4851 				else if(internal.samples == 16)
4852 				{
4853 					for(int y = 0; y < height; y++)
4854 					{
4855 						for(int x = 0; x < 2 * width; x++)
4856 						{
4857 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4858 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4859 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4860 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4861 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4862 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4863 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4864 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4865 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4866 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4867 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4868 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4869 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4870 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4871 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4872 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4873 
4874 							c0 = AVERAGE(c0, c1);
4875 							c2 = AVERAGE(c2, c3);
4876 							c4 = AVERAGE(c4, c5);
4877 							c6 = AVERAGE(c6, c7);
4878 							c8 = AVERAGE(c8, c9);
4879 							cA = AVERAGE(cA, cB);
4880 							cC = AVERAGE(cC, cD);
4881 							cE = AVERAGE(cE, cF);
4882 							c0 = AVERAGE(c0, c2);
4883 							c4 = AVERAGE(c4, c6);
4884 							c8 = AVERAGE(c8, cA);
4885 							cC = AVERAGE(cC, cE);
4886 							c0 = AVERAGE(c0, c4);
4887 							c8 = AVERAGE(c8, cC);
4888 							c0 = AVERAGE(c0, c8);
4889 
4890 							*(unsigned int*)(source0 + 4 * x) = c0;
4891 						}
4892 
4893 						source0 += pitch;
4894 						source1 += pitch;
4895 						source2 += pitch;
4896 						source3 += pitch;
4897 						source4 += pitch;
4898 						source5 += pitch;
4899 						source6 += pitch;
4900 						source7 += pitch;
4901 						source8 += pitch;
4902 						source9 += pitch;
4903 						sourceA += pitch;
4904 						sourceB += pitch;
4905 						sourceC += pitch;
4906 						sourceD += pitch;
4907 						sourceE += pitch;
4908 						sourceF += pitch;
4909 					}
4910 				}
4911 				else ASSERT(false);
4912 
4913 				#undef AVERAGE
4914 			}
4915 		}
4916 		else if(internal.format == FORMAT_R32F)
4917 		{
4918 			#if defined(__i386__) || defined(__x86_64__)
4919 				if(CPUID::supportsSSE() && (width % 4) == 0)
4920 				{
4921 					if(internal.samples == 2)
4922 					{
4923 						for(int y = 0; y < height; y++)
4924 						{
4925 							for(int x = 0; x < width; x += 4)
4926 							{
4927 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4928 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4929 
4930 								c0 = _mm_add_ps(c0, c1);
4931 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4932 
4933 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4934 							}
4935 
4936 							source0 += pitch;
4937 							source1 += pitch;
4938 						}
4939 					}
4940 					else if(internal.samples == 4)
4941 					{
4942 						for(int y = 0; y < height; y++)
4943 						{
4944 							for(int x = 0; x < width; x += 4)
4945 							{
4946 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4947 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4948 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4949 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4950 
4951 								c0 = _mm_add_ps(c0, c1);
4952 								c2 = _mm_add_ps(c2, c3);
4953 								c0 = _mm_add_ps(c0, c2);
4954 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4955 
4956 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4957 							}
4958 
4959 							source0 += pitch;
4960 							source1 += pitch;
4961 							source2 += pitch;
4962 							source3 += pitch;
4963 						}
4964 					}
4965 					else if(internal.samples == 8)
4966 					{
4967 						for(int y = 0; y < height; y++)
4968 						{
4969 							for(int x = 0; x < width; x += 4)
4970 							{
4971 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4972 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4973 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4974 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4975 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4976 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4977 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4978 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4979 
4980 								c0 = _mm_add_ps(c0, c1);
4981 								c2 = _mm_add_ps(c2, c3);
4982 								c4 = _mm_add_ps(c4, c5);
4983 								c6 = _mm_add_ps(c6, c7);
4984 								c0 = _mm_add_ps(c0, c2);
4985 								c4 = _mm_add_ps(c4, c6);
4986 								c0 = _mm_add_ps(c0, c4);
4987 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4988 
4989 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4990 							}
4991 
4992 							source0 += pitch;
4993 							source1 += pitch;
4994 							source2 += pitch;
4995 							source3 += pitch;
4996 							source4 += pitch;
4997 							source5 += pitch;
4998 							source6 += pitch;
4999 							source7 += pitch;
5000 						}
5001 					}
5002 					else if(internal.samples == 16)
5003 					{
5004 						for(int y = 0; y < height; y++)
5005 						{
5006 							for(int x = 0; x < width; x += 4)
5007 							{
5008 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
5009 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
5010 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
5011 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
5012 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5013 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5014 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5015 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5016 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
5017 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
5018 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
5019 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
5020 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
5021 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
5022 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
5023 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
5024 
5025 								c0 = _mm_add_ps(c0, c1);
5026 								c2 = _mm_add_ps(c2, c3);
5027 								c4 = _mm_add_ps(c4, c5);
5028 								c6 = _mm_add_ps(c6, c7);
5029 								c8 = _mm_add_ps(c8, c9);
5030 								cA = _mm_add_ps(cA, cB);
5031 								cC = _mm_add_ps(cC, cD);
5032 								cE = _mm_add_ps(cE, cF);
5033 								c0 = _mm_add_ps(c0, c2);
5034 								c4 = _mm_add_ps(c4, c6);
5035 								c8 = _mm_add_ps(c8, cA);
5036 								cC = _mm_add_ps(cC, cE);
5037 								c0 = _mm_add_ps(c0, c4);
5038 								c8 = _mm_add_ps(c8, cC);
5039 								c0 = _mm_add_ps(c0, c8);
5040 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5041 
5042 								_mm_store_ps((float*)(source0 + 4 * x), c0);
5043 							}
5044 
5045 							source0 += pitch;
5046 							source1 += pitch;
5047 							source2 += pitch;
5048 							source3 += pitch;
5049 							source4 += pitch;
5050 							source5 += pitch;
5051 							source6 += pitch;
5052 							source7 += pitch;
5053 							source8 += pitch;
5054 							source9 += pitch;
5055 							sourceA += pitch;
5056 							sourceB += pitch;
5057 							sourceC += pitch;
5058 							sourceD += pitch;
5059 							sourceE += pitch;
5060 							sourceF += pitch;
5061 						}
5062 					}
5063 					else ASSERT(false);
5064 				}
5065 				else
5066 			#endif
5067 			{
5068 				if(internal.samples == 2)
5069 				{
5070 					for(int y = 0; y < height; y++)
5071 					{
5072 						for(int x = 0; x < width; x++)
5073 						{
5074 							float c0 = *(float*)(source0 + 4 * x);
5075 							float c1 = *(float*)(source1 + 4 * x);
5076 
5077 							c0 = c0 + c1;
5078 							c0 *= 1.0f / 2.0f;
5079 
5080 							*(float*)(source0 + 4 * x) = c0;
5081 						}
5082 
5083 						source0 += pitch;
5084 						source1 += pitch;
5085 					}
5086 				}
5087 				else if(internal.samples == 4)
5088 				{
5089 					for(int y = 0; y < height; y++)
5090 					{
5091 						for(int x = 0; x < width; x++)
5092 						{
5093 							float c0 = *(float*)(source0 + 4 * x);
5094 							float c1 = *(float*)(source1 + 4 * x);
5095 							float c2 = *(float*)(source2 + 4 * x);
5096 							float c3 = *(float*)(source3 + 4 * x);
5097 
5098 							c0 = c0 + c1;
5099 							c2 = c2 + c3;
5100 							c0 = c0 + c2;
5101 							c0 *= 1.0f / 4.0f;
5102 
5103 							*(float*)(source0 + 4 * x) = c0;
5104 						}
5105 
5106 						source0 += pitch;
5107 						source1 += pitch;
5108 						source2 += pitch;
5109 						source3 += pitch;
5110 					}
5111 				}
5112 				else if(internal.samples == 8)
5113 				{
5114 					for(int y = 0; y < height; y++)
5115 					{
5116 						for(int x = 0; x < width; x++)
5117 						{
5118 							float c0 = *(float*)(source0 + 4 * x);
5119 							float c1 = *(float*)(source1 + 4 * x);
5120 							float c2 = *(float*)(source2 + 4 * x);
5121 							float c3 = *(float*)(source3 + 4 * x);
5122 							float c4 = *(float*)(source4 + 4 * x);
5123 							float c5 = *(float*)(source5 + 4 * x);
5124 							float c6 = *(float*)(source6 + 4 * x);
5125 							float c7 = *(float*)(source7 + 4 * x);
5126 
5127 							c0 = c0 + c1;
5128 							c2 = c2 + c3;
5129 							c4 = c4 + c5;
5130 							c6 = c6 + c7;
5131 							c0 = c0 + c2;
5132 							c4 = c4 + c6;
5133 							c0 = c0 + c4;
5134 							c0 *= 1.0f / 8.0f;
5135 
5136 							*(float*)(source0 + 4 * x) = c0;
5137 						}
5138 
5139 						source0 += pitch;
5140 						source1 += pitch;
5141 						source2 += pitch;
5142 						source3 += pitch;
5143 						source4 += pitch;
5144 						source5 += pitch;
5145 						source6 += pitch;
5146 						source7 += pitch;
5147 					}
5148 				}
5149 				else if(internal.samples == 16)
5150 				{
5151 					for(int y = 0; y < height; y++)
5152 					{
5153 						for(int x = 0; x < width; x++)
5154 						{
5155 							float c0 = *(float*)(source0 + 4 * x);
5156 							float c1 = *(float*)(source1 + 4 * x);
5157 							float c2 = *(float*)(source2 + 4 * x);
5158 							float c3 = *(float*)(source3 + 4 * x);
5159 							float c4 = *(float*)(source4 + 4 * x);
5160 							float c5 = *(float*)(source5 + 4 * x);
5161 							float c6 = *(float*)(source6 + 4 * x);
5162 							float c7 = *(float*)(source7 + 4 * x);
5163 							float c8 = *(float*)(source8 + 4 * x);
5164 							float c9 = *(float*)(source9 + 4 * x);
5165 							float cA = *(float*)(sourceA + 4 * x);
5166 							float cB = *(float*)(sourceB + 4 * x);
5167 							float cC = *(float*)(sourceC + 4 * x);
5168 							float cD = *(float*)(sourceD + 4 * x);
5169 							float cE = *(float*)(sourceE + 4 * x);
5170 							float cF = *(float*)(sourceF + 4 * x);
5171 
5172 							c0 = c0 + c1;
5173 							c2 = c2 + c3;
5174 							c4 = c4 + c5;
5175 							c6 = c6 + c7;
5176 							c8 = c8 + c9;
5177 							cA = cA + cB;
5178 							cC = cC + cD;
5179 							cE = cE + cF;
5180 							c0 = c0 + c2;
5181 							c4 = c4 + c6;
5182 							c8 = c8 + cA;
5183 							cC = cC + cE;
5184 							c0 = c0 + c4;
5185 							c8 = c8 + cC;
5186 							c0 = c0 + c8;
5187 							c0 *= 1.0f / 16.0f;
5188 
5189 							*(float*)(source0 + 4 * x) = c0;
5190 						}
5191 
5192 						source0 += pitch;
5193 						source1 += pitch;
5194 						source2 += pitch;
5195 						source3 += pitch;
5196 						source4 += pitch;
5197 						source5 += pitch;
5198 						source6 += pitch;
5199 						source7 += pitch;
5200 						source8 += pitch;
5201 						source9 += pitch;
5202 						sourceA += pitch;
5203 						sourceB += pitch;
5204 						sourceC += pitch;
5205 						sourceD += pitch;
5206 						sourceE += pitch;
5207 						sourceF += pitch;
5208 					}
5209 				}
5210 				else ASSERT(false);
5211 			}
5212 		}
5213 		else if(internal.format == FORMAT_G32R32F)
5214 		{
5215 			#if defined(__i386__) || defined(__x86_64__)
5216 				if(CPUID::supportsSSE() && (width % 2) == 0)
5217 				{
5218 					if(internal.samples == 2)
5219 					{
5220 						for(int y = 0; y < height; y++)
5221 						{
5222 							for(int x = 0; x < width; x += 2)
5223 							{
5224 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5225 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5226 
5227 								c0 = _mm_add_ps(c0, c1);
5228 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5229 
5230 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5231 							}
5232 
5233 							source0 += pitch;
5234 							source1 += pitch;
5235 						}
5236 					}
5237 					else if(internal.samples == 4)
5238 					{
5239 						for(int y = 0; y < height; y++)
5240 						{
5241 							for(int x = 0; x < width; x += 2)
5242 							{
5243 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5244 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5245 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5246 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5247 
5248 								c0 = _mm_add_ps(c0, c1);
5249 								c2 = _mm_add_ps(c2, c3);
5250 								c0 = _mm_add_ps(c0, c2);
5251 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5252 
5253 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5254 							}
5255 
5256 							source0 += pitch;
5257 							source1 += pitch;
5258 							source2 += pitch;
5259 							source3 += pitch;
5260 						}
5261 					}
5262 					else if(internal.samples == 8)
5263 					{
5264 						for(int y = 0; y < height; y++)
5265 						{
5266 							for(int x = 0; x < width; x += 2)
5267 							{
5268 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5269 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5270 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5271 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5272 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5273 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5274 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5275 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5276 
5277 								c0 = _mm_add_ps(c0, c1);
5278 								c2 = _mm_add_ps(c2, c3);
5279 								c4 = _mm_add_ps(c4, c5);
5280 								c6 = _mm_add_ps(c6, c7);
5281 								c0 = _mm_add_ps(c0, c2);
5282 								c4 = _mm_add_ps(c4, c6);
5283 								c0 = _mm_add_ps(c0, c4);
5284 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5285 
5286 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5287 							}
5288 
5289 							source0 += pitch;
5290 							source1 += pitch;
5291 							source2 += pitch;
5292 							source3 += pitch;
5293 							source4 += pitch;
5294 							source5 += pitch;
5295 							source6 += pitch;
5296 							source7 += pitch;
5297 						}
5298 					}
5299 					else if(internal.samples == 16)
5300 					{
5301 						for(int y = 0; y < height; y++)
5302 						{
5303 							for(int x = 0; x < width; x += 2)
5304 							{
5305 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5306 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5307 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5308 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5309 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5310 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5311 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5312 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5313 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
5314 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
5315 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
5316 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
5317 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
5318 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
5319 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
5320 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
5321 
5322 								c0 = _mm_add_ps(c0, c1);
5323 								c2 = _mm_add_ps(c2, c3);
5324 								c4 = _mm_add_ps(c4, c5);
5325 								c6 = _mm_add_ps(c6, c7);
5326 								c8 = _mm_add_ps(c8, c9);
5327 								cA = _mm_add_ps(cA, cB);
5328 								cC = _mm_add_ps(cC, cD);
5329 								cE = _mm_add_ps(cE, cF);
5330 								c0 = _mm_add_ps(c0, c2);
5331 								c4 = _mm_add_ps(c4, c6);
5332 								c8 = _mm_add_ps(c8, cA);
5333 								cC = _mm_add_ps(cC, cE);
5334 								c0 = _mm_add_ps(c0, c4);
5335 								c8 = _mm_add_ps(c8, cC);
5336 								c0 = _mm_add_ps(c0, c8);
5337 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5338 
5339 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5340 							}
5341 
5342 							source0 += pitch;
5343 							source1 += pitch;
5344 							source2 += pitch;
5345 							source3 += pitch;
5346 							source4 += pitch;
5347 							source5 += pitch;
5348 							source6 += pitch;
5349 							source7 += pitch;
5350 							source8 += pitch;
5351 							source9 += pitch;
5352 							sourceA += pitch;
5353 							sourceB += pitch;
5354 							sourceC += pitch;
5355 							sourceD += pitch;
5356 							sourceE += pitch;
5357 							sourceF += pitch;
5358 						}
5359 					}
5360 					else ASSERT(false);
5361 				}
5362 				else
5363 			#endif
5364 			{
5365 				if(internal.samples == 2)
5366 				{
5367 					for(int y = 0; y < height; y++)
5368 					{
5369 						for(int x = 0; x < 2 * width; x++)
5370 						{
5371 							float c0 = *(float*)(source0 + 4 * x);
5372 							float c1 = *(float*)(source1 + 4 * x);
5373 
5374 							c0 = c0 + c1;
5375 							c0 *= 1.0f / 2.0f;
5376 
5377 							*(float*)(source0 + 4 * x) = c0;
5378 						}
5379 
5380 						source0 += pitch;
5381 						source1 += pitch;
5382 					}
5383 				}
5384 				else if(internal.samples == 4)
5385 				{
5386 					for(int y = 0; y < height; y++)
5387 					{
5388 						for(int x = 0; x < 2 * width; x++)
5389 						{
5390 							float c0 = *(float*)(source0 + 4 * x);
5391 							float c1 = *(float*)(source1 + 4 * x);
5392 							float c2 = *(float*)(source2 + 4 * x);
5393 							float c3 = *(float*)(source3 + 4 * x);
5394 
5395 							c0 = c0 + c1;
5396 							c2 = c2 + c3;
5397 							c0 = c0 + c2;
5398 							c0 *= 1.0f / 4.0f;
5399 
5400 							*(float*)(source0 + 4 * x) = c0;
5401 						}
5402 
5403 						source0 += pitch;
5404 						source1 += pitch;
5405 						source2 += pitch;
5406 						source3 += pitch;
5407 					}
5408 				}
5409 				else if(internal.samples == 8)
5410 				{
5411 					for(int y = 0; y < height; y++)
5412 					{
5413 						for(int x = 0; x < 2 * width; x++)
5414 						{
5415 							float c0 = *(float*)(source0 + 4 * x);
5416 							float c1 = *(float*)(source1 + 4 * x);
5417 							float c2 = *(float*)(source2 + 4 * x);
5418 							float c3 = *(float*)(source3 + 4 * x);
5419 							float c4 = *(float*)(source4 + 4 * x);
5420 							float c5 = *(float*)(source5 + 4 * x);
5421 							float c6 = *(float*)(source6 + 4 * x);
5422 							float c7 = *(float*)(source7 + 4 * x);
5423 
5424 							c0 = c0 + c1;
5425 							c2 = c2 + c3;
5426 							c4 = c4 + c5;
5427 							c6 = c6 + c7;
5428 							c0 = c0 + c2;
5429 							c4 = c4 + c6;
5430 							c0 = c0 + c4;
5431 							c0 *= 1.0f / 8.0f;
5432 
5433 							*(float*)(source0 + 4 * x) = c0;
5434 						}
5435 
5436 						source0 += pitch;
5437 						source1 += pitch;
5438 						source2 += pitch;
5439 						source3 += pitch;
5440 						source4 += pitch;
5441 						source5 += pitch;
5442 						source6 += pitch;
5443 						source7 += pitch;
5444 					}
5445 				}
5446 				else if(internal.samples == 16)
5447 				{
5448 					for(int y = 0; y < height; y++)
5449 					{
5450 						for(int x = 0; x < 2 * width; x++)
5451 						{
5452 							float c0 = *(float*)(source0 + 4 * x);
5453 							float c1 = *(float*)(source1 + 4 * x);
5454 							float c2 = *(float*)(source2 + 4 * x);
5455 							float c3 = *(float*)(source3 + 4 * x);
5456 							float c4 = *(float*)(source4 + 4 * x);
5457 							float c5 = *(float*)(source5 + 4 * x);
5458 							float c6 = *(float*)(source6 + 4 * x);
5459 							float c7 = *(float*)(source7 + 4 * x);
5460 							float c8 = *(float*)(source8 + 4 * x);
5461 							float c9 = *(float*)(source9 + 4 * x);
5462 							float cA = *(float*)(sourceA + 4 * x);
5463 							float cB = *(float*)(sourceB + 4 * x);
5464 							float cC = *(float*)(sourceC + 4 * x);
5465 							float cD = *(float*)(sourceD + 4 * x);
5466 							float cE = *(float*)(sourceE + 4 * x);
5467 							float cF = *(float*)(sourceF + 4 * x);
5468 
5469 							c0 = c0 + c1;
5470 							c2 = c2 + c3;
5471 							c4 = c4 + c5;
5472 							c6 = c6 + c7;
5473 							c8 = c8 + c9;
5474 							cA = cA + cB;
5475 							cC = cC + cD;
5476 							cE = cE + cF;
5477 							c0 = c0 + c2;
5478 							c4 = c4 + c6;
5479 							c8 = c8 + cA;
5480 							cC = cC + cE;
5481 							c0 = c0 + c4;
5482 							c8 = c8 + cC;
5483 							c0 = c0 + c8;
5484 							c0 *= 1.0f / 16.0f;
5485 
5486 							*(float*)(source0 + 4 * x) = c0;
5487 						}
5488 
5489 						source0 += pitch;
5490 						source1 += pitch;
5491 						source2 += pitch;
5492 						source3 += pitch;
5493 						source4 += pitch;
5494 						source5 += pitch;
5495 						source6 += pitch;
5496 						source7 += pitch;
5497 						source8 += pitch;
5498 						source9 += pitch;
5499 						sourceA += pitch;
5500 						sourceB += pitch;
5501 						sourceC += pitch;
5502 						sourceD += pitch;
5503 						sourceE += pitch;
5504 						sourceF += pitch;
5505 					}
5506 				}
5507 				else ASSERT(false);
5508 			}
5509 		}
5510 		else if(internal.format == FORMAT_A32B32G32R32F ||
5511 		        internal.format == FORMAT_X32B32G32R32F ||
5512 		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
5513 		{
5514 			#if defined(__i386__) || defined(__x86_64__)
5515 				if(CPUID::supportsSSE())
5516 				{
5517 					if(internal.samples == 2)
5518 					{
5519 						for(int y = 0; y < height; y++)
5520 						{
5521 							for(int x = 0; x < width; x++)
5522 							{
5523 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5524 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5525 
5526 								c0 = _mm_add_ps(c0, c1);
5527 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5528 
5529 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5530 							}
5531 
5532 							source0 += pitch;
5533 							source1 += pitch;
5534 						}
5535 					}
5536 					else if(internal.samples == 4)
5537 					{
5538 						for(int y = 0; y < height; y++)
5539 						{
5540 							for(int x = 0; x < width; x++)
5541 							{
5542 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5543 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5544 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5545 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5546 
5547 								c0 = _mm_add_ps(c0, c1);
5548 								c2 = _mm_add_ps(c2, c3);
5549 								c0 = _mm_add_ps(c0, c2);
5550 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5551 
5552 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5553 							}
5554 
5555 							source0 += pitch;
5556 							source1 += pitch;
5557 							source2 += pitch;
5558 							source3 += pitch;
5559 						}
5560 					}
5561 					else if(internal.samples == 8)
5562 					{
5563 						for(int y = 0; y < height; y++)
5564 						{
5565 							for(int x = 0; x < width; x++)
5566 							{
5567 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5568 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5569 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5570 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5571 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5572 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5573 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5574 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5575 
5576 								c0 = _mm_add_ps(c0, c1);
5577 								c2 = _mm_add_ps(c2, c3);
5578 								c4 = _mm_add_ps(c4, c5);
5579 								c6 = _mm_add_ps(c6, c7);
5580 								c0 = _mm_add_ps(c0, c2);
5581 								c4 = _mm_add_ps(c4, c6);
5582 								c0 = _mm_add_ps(c0, c4);
5583 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5584 
5585 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5586 							}
5587 
5588 							source0 += pitch;
5589 							source1 += pitch;
5590 							source2 += pitch;
5591 							source3 += pitch;
5592 							source4 += pitch;
5593 							source5 += pitch;
5594 							source6 += pitch;
5595 							source7 += pitch;
5596 						}
5597 					}
5598 					else if(internal.samples == 16)
5599 					{
5600 						for(int y = 0; y < height; y++)
5601 						{
5602 							for(int x = 0; x < width; x++)
5603 							{
5604 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5605 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5606 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5607 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5608 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5609 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5610 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5611 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5612 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5613 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5614 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5615 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5616 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5617 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5618 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5619 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5620 
5621 								c0 = _mm_add_ps(c0, c1);
5622 								c2 = _mm_add_ps(c2, c3);
5623 								c4 = _mm_add_ps(c4, c5);
5624 								c6 = _mm_add_ps(c6, c7);
5625 								c8 = _mm_add_ps(c8, c9);
5626 								cA = _mm_add_ps(cA, cB);
5627 								cC = _mm_add_ps(cC, cD);
5628 								cE = _mm_add_ps(cE, cF);
5629 								c0 = _mm_add_ps(c0, c2);
5630 								c4 = _mm_add_ps(c4, c6);
5631 								c8 = _mm_add_ps(c8, cA);
5632 								cC = _mm_add_ps(cC, cE);
5633 								c0 = _mm_add_ps(c0, c4);
5634 								c8 = _mm_add_ps(c8, cC);
5635 								c0 = _mm_add_ps(c0, c8);
5636 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5637 
5638 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5639 							}
5640 
5641 							source0 += pitch;
5642 							source1 += pitch;
5643 							source2 += pitch;
5644 							source3 += pitch;
5645 							source4 += pitch;
5646 							source5 += pitch;
5647 							source6 += pitch;
5648 							source7 += pitch;
5649 							source8 += pitch;
5650 							source9 += pitch;
5651 							sourceA += pitch;
5652 							sourceB += pitch;
5653 							sourceC += pitch;
5654 							sourceD += pitch;
5655 							sourceE += pitch;
5656 							sourceF += pitch;
5657 						}
5658 					}
5659 					else ASSERT(false);
5660 				}
5661 				else
5662 			#endif
5663 			{
5664 				if(internal.samples == 2)
5665 				{
5666 					for(int y = 0; y < height; y++)
5667 					{
5668 						for(int x = 0; x < 4 * width; x++)
5669 						{
5670 							float c0 = *(float*)(source0 + 4 * x);
5671 							float c1 = *(float*)(source1 + 4 * x);
5672 
5673 							c0 = c0 + c1;
5674 							c0 *= 1.0f / 2.0f;
5675 
5676 							*(float*)(source0 + 4 * x) = c0;
5677 						}
5678 
5679 						source0 += pitch;
5680 						source1 += pitch;
5681 					}
5682 				}
5683 				else if(internal.samples == 4)
5684 				{
5685 					for(int y = 0; y < height; y++)
5686 					{
5687 						for(int x = 0; x < 4 * width; x++)
5688 						{
5689 							float c0 = *(float*)(source0 + 4 * x);
5690 							float c1 = *(float*)(source1 + 4 * x);
5691 							float c2 = *(float*)(source2 + 4 * x);
5692 							float c3 = *(float*)(source3 + 4 * x);
5693 
5694 							c0 = c0 + c1;
5695 							c2 = c2 + c3;
5696 							c0 = c0 + c2;
5697 							c0 *= 1.0f / 4.0f;
5698 
5699 							*(float*)(source0 + 4 * x) = c0;
5700 						}
5701 
5702 						source0 += pitch;
5703 						source1 += pitch;
5704 						source2 += pitch;
5705 						source3 += pitch;
5706 					}
5707 				}
5708 				else if(internal.samples == 8)
5709 				{
5710 					for(int y = 0; y < height; y++)
5711 					{
5712 						for(int x = 0; x < 4 * width; x++)
5713 						{
5714 							float c0 = *(float*)(source0 + 4 * x);
5715 							float c1 = *(float*)(source1 + 4 * x);
5716 							float c2 = *(float*)(source2 + 4 * x);
5717 							float c3 = *(float*)(source3 + 4 * x);
5718 							float c4 = *(float*)(source4 + 4 * x);
5719 							float c5 = *(float*)(source5 + 4 * x);
5720 							float c6 = *(float*)(source6 + 4 * x);
5721 							float c7 = *(float*)(source7 + 4 * x);
5722 
5723 							c0 = c0 + c1;
5724 							c2 = c2 + c3;
5725 							c4 = c4 + c5;
5726 							c6 = c6 + c7;
5727 							c0 = c0 + c2;
5728 							c4 = c4 + c6;
5729 							c0 = c0 + c4;
5730 							c0 *= 1.0f / 8.0f;
5731 
5732 							*(float*)(source0 + 4 * x) = c0;
5733 						}
5734 
5735 						source0 += pitch;
5736 						source1 += pitch;
5737 						source2 += pitch;
5738 						source3 += pitch;
5739 						source4 += pitch;
5740 						source5 += pitch;
5741 						source6 += pitch;
5742 						source7 += pitch;
5743 					}
5744 				}
5745 				else if(internal.samples == 16)
5746 				{
5747 					for(int y = 0; y < height; y++)
5748 					{
5749 						for(int x = 0; x < 4 * width; x++)
5750 						{
5751 							float c0 = *(float*)(source0 + 4 * x);
5752 							float c1 = *(float*)(source1 + 4 * x);
5753 							float c2 = *(float*)(source2 + 4 * x);
5754 							float c3 = *(float*)(source3 + 4 * x);
5755 							float c4 = *(float*)(source4 + 4 * x);
5756 							float c5 = *(float*)(source5 + 4 * x);
5757 							float c6 = *(float*)(source6 + 4 * x);
5758 							float c7 = *(float*)(source7 + 4 * x);
5759 							float c8 = *(float*)(source8 + 4 * x);
5760 							float c9 = *(float*)(source9 + 4 * x);
5761 							float cA = *(float*)(sourceA + 4 * x);
5762 							float cB = *(float*)(sourceB + 4 * x);
5763 							float cC = *(float*)(sourceC + 4 * x);
5764 							float cD = *(float*)(sourceD + 4 * x);
5765 							float cE = *(float*)(sourceE + 4 * x);
5766 							float cF = *(float*)(sourceF + 4 * x);
5767 
5768 							c0 = c0 + c1;
5769 							c2 = c2 + c3;
5770 							c4 = c4 + c5;
5771 							c6 = c6 + c7;
5772 							c8 = c8 + c9;
5773 							cA = cA + cB;
5774 							cC = cC + cD;
5775 							cE = cE + cF;
5776 							c0 = c0 + c2;
5777 							c4 = c4 + c6;
5778 							c8 = c8 + cA;
5779 							cC = cC + cE;
5780 							c0 = c0 + c4;
5781 							c8 = c8 + cC;
5782 							c0 = c0 + c8;
5783 							c0 *= 1.0f / 16.0f;
5784 
5785 							*(float*)(source0 + 4 * x) = c0;
5786 						}
5787 
5788 						source0 += pitch;
5789 						source1 += pitch;
5790 						source2 += pitch;
5791 						source3 += pitch;
5792 						source4 += pitch;
5793 						source5 += pitch;
5794 						source6 += pitch;
5795 						source7 += pitch;
5796 						source8 += pitch;
5797 						source9 += pitch;
5798 						sourceA += pitch;
5799 						sourceB += pitch;
5800 						sourceC += pitch;
5801 						sourceD += pitch;
5802 						sourceE += pitch;
5803 						sourceF += pitch;
5804 					}
5805 				}
5806 				else ASSERT(false);
5807 			}
5808 		}
5809 		else if(internal.format == FORMAT_R5G6B5)
5810 		{
5811 			#if defined(__i386__) || defined(__x86_64__)
5812 				if(CPUID::supportsSSE2() && (width % 8) == 0)
5813 				{
5814 					if(internal.samples == 2)
5815 					{
5816 						for(int y = 0; y < height; y++)
5817 						{
5818 							for(int x = 0; x < width; x += 8)
5819 							{
5820 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5821 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5822 
5823 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5824 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5825 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5826 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5827 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5828 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5829 
5830 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5831 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5832 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5833 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5834 								c0 = _mm_or_si128(c0, c1);
5835 
5836 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5837 							}
5838 
5839 							source0 += pitch;
5840 							source1 += pitch;
5841 						}
5842 					}
5843 					else if(internal.samples == 4)
5844 					{
5845 						for(int y = 0; y < height; y++)
5846 						{
5847 							for(int x = 0; x < width; x += 8)
5848 							{
5849 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5850 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5851 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5852 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5853 
5854 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5855 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5856 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5857 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5858 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5859 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5860 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5861 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5862 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5863 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5864 
5865 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5866 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5867 								c0 = _mm_avg_epu8(c0, c2);
5868 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5869 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5870 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5871 								c1 = _mm_avg_epu16(c1, c3);
5872 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5873 								c0 = _mm_or_si128(c0, c1);
5874 
5875 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5876 							}
5877 
5878 							source0 += pitch;
5879 							source1 += pitch;
5880 							source2 += pitch;
5881 							source3 += pitch;
5882 						}
5883 					}
5884 					else if(internal.samples == 8)
5885 					{
5886 						for(int y = 0; y < height; y++)
5887 						{
5888 							for(int x = 0; x < width; x += 8)
5889 							{
5890 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5891 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5892 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5893 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5894 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5895 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5896 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5897 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5898 
5899 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5900 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5901 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5902 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5903 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5904 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5905 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5906 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5907 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5908 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5909 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5910 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5911 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5912 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5913 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5914 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5915 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5916 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5917 
5918 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5919 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5920 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5921 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5922 								c0 = _mm_avg_epu8(c0, c2);
5923 								c4 = _mm_avg_epu8(c4, c6);
5924 								c0 = _mm_avg_epu8(c0, c4);
5925 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5926 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5927 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5928 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5929 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5930 								c1 = _mm_avg_epu16(c1, c3);
5931 								c5 = _mm_avg_epu16(c5, c7);
5932 								c1 = _mm_avg_epu16(c1, c5);
5933 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5934 								c0 = _mm_or_si128(c0, c1);
5935 
5936 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5937 							}
5938 
5939 							source0 += pitch;
5940 							source1 += pitch;
5941 							source2 += pitch;
5942 							source3 += pitch;
5943 							source4 += pitch;
5944 							source5 += pitch;
5945 							source6 += pitch;
5946 							source7 += pitch;
5947 						}
5948 					}
5949 					else if(internal.samples == 16)
5950 					{
5951 						for(int y = 0; y < height; y++)
5952 						{
5953 							for(int x = 0; x < width; x += 8)
5954 							{
5955 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5956 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5957 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5958 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5959 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5960 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5961 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5962 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5963 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5964 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5965 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5966 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5967 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5968 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5969 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5970 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5971 
5972 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5973 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5974 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5975 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5976 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5977 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5978 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5979 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5980 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5981 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5982 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5983 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5984 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5985 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5986 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5987 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5988 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5989 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5990 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5991 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5992 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5993 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5994 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5995 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5996 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5997 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5998 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5999 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
6000 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
6001 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
6002 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
6003 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
6004 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
6005 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
6006 
6007 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
6008 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
6009 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
6010 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
6011 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
6012 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
6013 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
6014 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
6015 								c0 = _mm_avg_epu8(c0, c2);
6016 								c4 = _mm_avg_epu8(c4, c6);
6017 								c8 = _mm_avg_epu8(c8, cA);
6018 								cC = _mm_avg_epu8(cC, cE);
6019 								c0 = _mm_avg_epu8(c0, c4);
6020 								c8 = _mm_avg_epu8(c8, cC);
6021 								c0 = _mm_avg_epu8(c0, c8);
6022 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
6023 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
6024 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
6025 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
6026 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
6027 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
6028 								cB = _mm_avg_epu16(cA__g_, cB__g_);
6029 								cD = _mm_avg_epu16(cC__g_, cD__g_);
6030 								cF = _mm_avg_epu16(cE__g_, cF__g_);
6031 								c1 = _mm_avg_epu8(c1, c3);
6032 								c5 = _mm_avg_epu8(c5, c7);
6033 								c9 = _mm_avg_epu8(c9, cB);
6034 								cD = _mm_avg_epu8(cD, cF);
6035 								c1 = _mm_avg_epu8(c1, c5);
6036 								c9 = _mm_avg_epu8(c9, cD);
6037 								c1 = _mm_avg_epu8(c1, c9);
6038 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6039 								c0 = _mm_or_si128(c0, c1);
6040 
6041 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
6042 							}
6043 
6044 							source0 += pitch;
6045 							source1 += pitch;
6046 							source2 += pitch;
6047 							source3 += pitch;
6048 							source4 += pitch;
6049 							source5 += pitch;
6050 							source6 += pitch;
6051 							source7 += pitch;
6052 							source8 += pitch;
6053 							source9 += pitch;
6054 							sourceA += pitch;
6055 							sourceB += pitch;
6056 							sourceC += pitch;
6057 							sourceD += pitch;
6058 							sourceE += pitch;
6059 							sourceF += pitch;
6060 						}
6061 					}
6062 					else ASSERT(false);
6063 				}
6064 				else
6065 			#endif
6066 			{
6067 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
6068 
6069 				if(internal.samples == 2)
6070 				{
6071 					for(int y = 0; y < height; y++)
6072 					{
6073 						for(int x = 0; x < width; x++)
6074 						{
6075 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6076 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6077 
6078 							c0 = AVERAGE(c0, c1);
6079 
6080 							*(unsigned short*)(source0 + 2 * x) = c0;
6081 						}
6082 
6083 						source0 += pitch;
6084 						source1 += pitch;
6085 					}
6086 				}
6087 				else if(internal.samples == 4)
6088 				{
6089 					for(int y = 0; y < height; y++)
6090 					{
6091 						for(int x = 0; x < width; x++)
6092 						{
6093 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6094 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6095 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6096 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6097 
6098 							c0 = AVERAGE(c0, c1);
6099 							c2 = AVERAGE(c2, c3);
6100 							c0 = AVERAGE(c0, c2);
6101 
6102 							*(unsigned short*)(source0 + 2 * x) = c0;
6103 						}
6104 
6105 						source0 += pitch;
6106 						source1 += pitch;
6107 						source2 += pitch;
6108 						source3 += pitch;
6109 					}
6110 				}
6111 				else if(internal.samples == 8)
6112 				{
6113 					for(int y = 0; y < height; y++)
6114 					{
6115 						for(int x = 0; x < width; x++)
6116 						{
6117 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6118 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6119 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6120 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6121 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6122 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6123 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6124 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6125 
6126 							c0 = AVERAGE(c0, c1);
6127 							c2 = AVERAGE(c2, c3);
6128 							c4 = AVERAGE(c4, c5);
6129 							c6 = AVERAGE(c6, c7);
6130 							c0 = AVERAGE(c0, c2);
6131 							c4 = AVERAGE(c4, c6);
6132 							c0 = AVERAGE(c0, c4);
6133 
6134 							*(unsigned short*)(source0 + 2 * x) = c0;
6135 						}
6136 
6137 						source0 += pitch;
6138 						source1 += pitch;
6139 						source2 += pitch;
6140 						source3 += pitch;
6141 						source4 += pitch;
6142 						source5 += pitch;
6143 						source6 += pitch;
6144 						source7 += pitch;
6145 					}
6146 				}
6147 				else if(internal.samples == 16)
6148 				{
6149 					for(int y = 0; y < height; y++)
6150 					{
6151 						for(int x = 0; x < width; x++)
6152 						{
6153 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6154 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6155 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6156 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6157 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6158 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6159 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6160 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6161 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
6162 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
6163 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
6164 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
6165 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
6166 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
6167 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
6168 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
6169 
6170 							c0 = AVERAGE(c0, c1);
6171 							c2 = AVERAGE(c2, c3);
6172 							c4 = AVERAGE(c4, c5);
6173 							c6 = AVERAGE(c6, c7);
6174 							c8 = AVERAGE(c8, c9);
6175 							cA = AVERAGE(cA, cB);
6176 							cC = AVERAGE(cC, cD);
6177 							cE = AVERAGE(cE, cF);
6178 							c0 = AVERAGE(c0, c2);
6179 							c4 = AVERAGE(c4, c6);
6180 							c8 = AVERAGE(c8, cA);
6181 							cC = AVERAGE(cC, cE);
6182 							c0 = AVERAGE(c0, c4);
6183 							c8 = AVERAGE(c8, cC);
6184 							c0 = AVERAGE(c0, c8);
6185 
6186 							*(unsigned short*)(source0 + 2 * x) = c0;
6187 						}
6188 
6189 						source0 += pitch;
6190 						source1 += pitch;
6191 						source2 += pitch;
6192 						source3 += pitch;
6193 						source4 += pitch;
6194 						source5 += pitch;
6195 						source6 += pitch;
6196 						source7 += pitch;
6197 						source8 += pitch;
6198 						source9 += pitch;
6199 						sourceA += pitch;
6200 						sourceB += pitch;
6201 						sourceC += pitch;
6202 						sourceD += pitch;
6203 						sourceE += pitch;
6204 						sourceF += pitch;
6205 					}
6206 				}
6207 				else ASSERT(false);
6208 
6209 				#undef AVERAGE
6210 			}
6211 		}
6212 		else
6213 		{
6214 		//	UNIMPLEMENTED();
6215 		}
6216 	}
6217 }
6218