1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 	#include <xmmintrin.h>
30 	#include <emmintrin.h>
31 #endif
32 
33 #undef min
34 #undef max
35 
36 namespace sw
37 {
38 	extern bool quadLayoutEnabled;
39 	extern bool complementaryDepthBuffer;
40 	extern TranscendentalPrecision logPrecision;
41 
42 	unsigned int *Surface::palette = 0;
43 	unsigned int Surface::paletteID = 0;
44 
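	// Writes one texel at (x, y, z), storing the same color into every
	// multisample of that texel (samples are stored as consecutive slices).
	// The border offset is included in the address computation.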
45 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
46 	{
47 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
48 
49 		for(int i = 0; i < samples; i++)
50 		{
51 			write(element, color);
52 			element += sliceB;
53 		}
54 	}
55 
56 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
57 	{
58 		byte *element = (byte*)buffer + (x + border) * bytes + (y + border) * pitchB;
59 
60 		for(int i = 0; i < samples; i++)
61 		{
62 			write(element, color);
63 			element += sliceB;
64 		}
65 	}
66 
67 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
68 	{
69 		float r = color.r;
70 		float g = color.g;
71 		float b = color.b;
72 		float a = color.a;
73 
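		// The incoming color is in linear space; sRGB destination formats are
		// gamma-encoded before the per-format packing below.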
74 		if(isSRGBformat(format))
75 		{
76 			r = linearToSRGB(r);
77 			g = linearToSRGB(g);
78 			b = linearToSRGB(b);
79 		}
80 
81 		switch(format)
82 		{
83 		case FORMAT_A8:
84 			*(unsigned char*)element = unorm<8>(a);
85 			break;
86 		case FORMAT_R8_SNORM:
87 			*(char*)element = snorm<8>(r);
88 			break;
89 		case FORMAT_R8:
90 			*(unsigned char*)element = unorm<8>(r);
91 			break;
92 		case FORMAT_R8I:
93 			*(char*)element = scast<8>(r);
94 			break;
95 		case FORMAT_R8UI:
96 			*(unsigned char*)element = ucast<8>(r);
97 			break;
98 		case FORMAT_R16I:
99 			*(short*)element = scast<16>(r);
100 			break;
101 		case FORMAT_R16UI:
102 			*(unsigned short*)element = ucast<16>(r);
103 			break;
104 		case FORMAT_R32I:
105 			*(int*)element = static_cast<int>(r);
106 			break;
107 		case FORMAT_R32UI:
108 			*(unsigned int*)element = static_cast<unsigned int>(r);
109 			break;
110 		case FORMAT_R3G3B2:
111 			*(unsigned char*)element = (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
112 			break;
113 		case FORMAT_A8R3G3B2:
114 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<3>(r) << 5) | (unorm<3>(g) << 2) | (unorm<2>(b) << 0);
115 			break;
116 		case FORMAT_X4R4G4B4:
117 			*(unsigned short*)element = 0xF000 | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
118 			break;
119 		case FORMAT_A4R4G4B4:
120 			*(unsigned short*)element = (unorm<4>(a) << 12) | (unorm<4>(r) << 8) | (unorm<4>(g) << 4) | (unorm<4>(b) << 0);
121 			break;
122 		case FORMAT_R4G4B4A4:
123 			*(unsigned short*)element = (unorm<4>(r) << 12) | (unorm<4>(g) << 8) | (unorm<4>(b) << 4) | (unorm<4>(a) << 0);
124 			break;
125 		case FORMAT_R5G6B5:
126 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<6>(g) << 5) | (unorm<5>(b) << 0);
127 			break;
128 		case FORMAT_A1R5G5B5:
129 			*(unsigned short*)element = (unorm<1>(a) << 15) | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
130 			break;
131 		case FORMAT_R5G5B5A1:
132 			*(unsigned short*)element = (unorm<5>(r) << 11) | (unorm<5>(g) << 6) | (unorm<5>(b) << 1) | (unorm<1>(a) << 0);
133 			break;
134 		case FORMAT_X1R5G5B5:
135 			*(unsigned short*)element = 0x8000 | (unorm<5>(r) << 10) | (unorm<5>(g) << 5) | (unorm<5>(b) << 0);
136 			break;
137 		case FORMAT_A8R8G8B8:
138 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
139 			break;
140 		case FORMAT_X8R8G8B8:
141 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(r) << 16) | (unorm<8>(g) << 8) | (unorm<8>(b) << 0);
142 			break;
143 		case FORMAT_A8B8G8R8_SNORM:
144 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(a)) << 24) |
145 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
146 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
147 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
148 			break;
149 		case FORMAT_A8B8G8R8:
150 		case FORMAT_SRGB8_A8:
151 			*(unsigned int*)element = (unorm<8>(a) << 24) | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
152 			break;
153 		case FORMAT_A8B8G8R8I:
154 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(a)) << 24) |
155 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
156 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
157 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
158 			break;
159 		case FORMAT_A8B8G8R8UI:
160 			*(unsigned int*)element = (ucast<8>(a) << 24) | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
161 			break;
162 		case FORMAT_X8B8G8R8_SNORM:
163 			*(unsigned int*)element = 0x7F000000 |
164 			                          (static_cast<unsigned int>(snorm<8>(b)) << 16) |
165 			                          (static_cast<unsigned int>(snorm<8>(g)) << 8) |
166 			                          (static_cast<unsigned int>(snorm<8>(r)) << 0);
167 			break;
168 		case FORMAT_X8B8G8R8:
169 		case FORMAT_SRGB8_X8:
170 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
171 			break;
172 		case FORMAT_X8B8G8R8I:
173 			*(unsigned int*)element = 0x7F000000 |
174 			                          (static_cast<unsigned int>(scast<8>(b)) << 16) |
175 			                          (static_cast<unsigned int>(scast<8>(g)) << 8) |
176 			                          (static_cast<unsigned int>(scast<8>(r)) << 0);
			break;
177 		case FORMAT_X8B8G8R8UI:
178 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(b) << 16) | (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
179 			break;
180 		case FORMAT_A2R10G10B10:
181 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(r) << 20) | (unorm<10>(g) << 10) | (unorm<10>(b) << 0);
182 			break;
183 		case FORMAT_A2B10G10R10:
184 		case FORMAT_A2B10G10R10UI:
185 			*(unsigned int*)element = (unorm<2>(a) << 30) | (unorm<10>(b) << 20) | (unorm<10>(g) << 10) | (unorm<10>(r) << 0);
186 			break;
187 		case FORMAT_G8R8_SNORM:
188 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(g)) << 8) |
189 			                            (static_cast<unsigned short>(snorm<8>(r)) << 0);
190 			break;
191 		case FORMAT_G8R8:
192 			*(unsigned short*)element = (unorm<8>(g) << 8) | (unorm<8>(r) << 0);
193 			break;
194 		case FORMAT_G8R8I:
195 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(g)) << 8) |
196 			                            (static_cast<unsigned short>(scast<8>(r)) << 0);
197 			break;
198 		case FORMAT_G8R8UI:
199 			*(unsigned short*)element = (ucast<8>(g) << 8) | (ucast<8>(r) << 0);
200 			break;
201 		case FORMAT_G16R16:
202 			*(unsigned int*)element = (unorm<16>(g) << 16) | (unorm<16>(r) << 0);
203 			break;
204 		case FORMAT_G16R16I:
205 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(g)) << 16) |
206 			                          (static_cast<unsigned int>(scast<16>(r)) << 0);
207 			break;
208 		case FORMAT_G16R16UI:
209 			*(unsigned int*)element = (ucast<16>(g) << 16) | (ucast<16>(r) << 0);
210 			break;
211 		case FORMAT_G32R32I:
212 		case FORMAT_G32R32UI:
213 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
214 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
215 			break;
216 		case FORMAT_A16B16G16R16:
217 			((unsigned short*)element)[0] = unorm<16>(r);
218 			((unsigned short*)element)[1] = unorm<16>(g);
219 			((unsigned short*)element)[2] = unorm<16>(b);
220 			((unsigned short*)element)[3] = unorm<16>(a);
221 			break;
222 		case FORMAT_A16B16G16R16I:
223 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
224 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
225 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
226 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(a));
227 			break;
228 		case FORMAT_A16B16G16R16UI:
229 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
230 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
231 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
232 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(a));
233 			break;
234 		case FORMAT_X16B16G16R16I:
235 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(r));
236 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(g));
237 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(b));
238 			break;
239 		case FORMAT_X16B16G16R16UI:
240 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(r));
241 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(g));
242 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(b));
243 			break;
244 		case FORMAT_A32B32G32R32I:
245 		case FORMAT_A32B32G32R32UI:
246 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
247 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
248 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
249 			((unsigned int*)element)[3] = static_cast<unsigned int>(a);
250 			break;
251 		case FORMAT_X32B32G32R32I:
252 		case FORMAT_X32B32G32R32UI:
253 			((unsigned int*)element)[0] = static_cast<unsigned int>(r);
254 			((unsigned int*)element)[1] = static_cast<unsigned int>(g);
255 			((unsigned int*)element)[2] = static_cast<unsigned int>(b);
256 			break;
257 		case FORMAT_V8U8:
258 			*(unsigned short*)element = (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
259 			break;
260 		case FORMAT_L6V5U5:
261 			*(unsigned short*)element = (unorm<6>(b) << 10) | (snorm<5>(g) << 5) | (snorm<5>(r) << 0);
262 			break;
263 		case FORMAT_Q8W8V8U8:
264 			*(unsigned int*)element = (snorm<8>(a) << 24) | (snorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
265 			break;
266 		case FORMAT_X8L8V8U8:
267 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(b) << 16) | (snorm<8>(g) << 8) | (snorm<8>(r) << 0);
268 			break;
269 		case FORMAT_V16U16:
270 			*(unsigned int*)element = (snorm<16>(g) << 16) | (snorm<16>(r) << 0);
271 			break;
272 		case FORMAT_A2W10V10U10:
273 			*(unsigned int*)element = (unorm<2>(a) << 30) | (snorm<10>(b) << 20) | (snorm<10>(g) << 10) | (snorm<10>(r) << 0);
274 			break;
275 		case FORMAT_A16W16V16U16:
276 			((unsigned short*)element)[0] = snorm<16>(r);
277 			((unsigned short*)element)[1] = snorm<16>(g);
278 			((unsigned short*)element)[2] = snorm<16>(b);
279 			((unsigned short*)element)[3] = unorm<16>(a);
280 			break;
281 		case FORMAT_Q16W16V16U16:
282 			((unsigned short*)element)[0] = snorm<16>(r);
283 			((unsigned short*)element)[1] = snorm<16>(g);
284 			((unsigned short*)element)[2] = snorm<16>(b);
285 			((unsigned short*)element)[3] = snorm<16>(a);
286 			break;
287 		case FORMAT_R8G8B8:
288 			((unsigned char*)element)[0] = unorm<8>(b);
289 			((unsigned char*)element)[1] = unorm<8>(g);
290 			((unsigned char*)element)[2] = unorm<8>(r);
291 			break;
292 		case FORMAT_B8G8R8:
293 			((unsigned char*)element)[0] = unorm<8>(r);
294 			((unsigned char*)element)[1] = unorm<8>(g);
295 			((unsigned char*)element)[2] = unorm<8>(b);
296 			break;
297 		case FORMAT_R16F:
298 			*(half*)element = (half)r;
299 			break;
300 		case FORMAT_A16F:
301 			*(half*)element = (half)a;
302 			break;
303 		case FORMAT_G16R16F:
304 			((half*)element)[0] = (half)r;
305 			((half*)element)[1] = (half)g;
306 			break;
307 		case FORMAT_X16B16G16R16F_UNSIGNED:
308 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
309 			// Fall through to FORMAT_X16B16G16R16F.
310 		case FORMAT_X16B16G16R16F:
311 			((half*)element)[3] = 1.0f;
312 			// Fall through to FORMAT_B16G16R16F.
313 		case FORMAT_B16G16R16F:
314 			((half*)element)[0] = (half)r;
315 			((half*)element)[1] = (half)g;
316 			((half*)element)[2] = (half)b;
317 			break;
318 		case FORMAT_A16B16G16R16F:
319 			((half*)element)[0] = (half)r;
320 			((half*)element)[1] = (half)g;
321 			((half*)element)[2] = (half)b;
322 			((half*)element)[3] = (half)a;
323 			break;
324 		case FORMAT_A32F:
325 			*(float*)element = a;
326 			break;
327 		case FORMAT_R32F:
328 			*(float*)element = r;
329 			break;
330 		case FORMAT_G32R32F:
331 			((float*)element)[0] = r;
332 			((float*)element)[1] = g;
333 			break;
334 		case FORMAT_X32B32G32R32F_UNSIGNED:
335 			r = max(r, 0.0f); g = max(g, 0.0f); b = max(b, 0.0f);
336 			// Fall through to FORMAT_X32B32G32R32F.
337 		case FORMAT_X32B32G32R32F:
338 			((float*)element)[3] = 1.0f;
339 			// Fall through to FORMAT_B32G32R32F.
340 		case FORMAT_B32G32R32F:
341 			((float*)element)[0] = r;
342 			((float*)element)[1] = g;
343 			((float*)element)[2] = b;
344 			break;
345 		case FORMAT_A32B32G32R32F:
346 			((float*)element)[0] = r;
347 			((float*)element)[1] = g;
348 			((float*)element)[2] = b;
349 			((float*)element)[3] = a;
350 			break;
351 		case FORMAT_D32F:
352 		case FORMAT_D32FS8:
353 		case FORMAT_D32F_LOCKABLE:
354 		case FORMAT_D32FS8_TEXTURE:
355 		case FORMAT_D32F_SHADOW:
356 		case FORMAT_D32FS8_SHADOW:
357 			*((float*)element) = r;
358 			break;
359 		case FORMAT_D32F_COMPLEMENTARY:
360 		case FORMAT_D32FS8_COMPLEMENTARY:
361 			*((float*)element) = 1 - r;
362 			break;
363 		case FORMAT_S8:
364 			*((unsigned char*)element) = unorm<8>(r);
365 			break;
366 		case FORMAT_L8:
367 			*(unsigned char*)element = unorm<8>(r);
368 			break;
369 		case FORMAT_A4L4:
370 			*(unsigned char*)element = (unorm<4>(a) << 4) | (unorm<4>(r) << 0);
371 			break;
372 		case FORMAT_L16:
373 			*(unsigned short*)element = unorm<16>(r);
374 			break;
375 		case FORMAT_A8L8:
376 			*(unsigned short*)element = (unorm<8>(a) << 8) | (unorm<8>(r) << 0);
377 			break;
378 		case FORMAT_L16F:
379 			*(half*)element = (half)r;
380 			break;
381 		case FORMAT_A16L16F:
382 			((half*)element)[0] = (half)r;
383 			((half*)element)[1] = (half)a;
384 			break;
385 		case FORMAT_L32F:
386 			*(float*)element = r;
387 			break;
388 		case FORMAT_A32L32F:
389 			((float*)element)[0] = r;
390 			((float*)element)[1] = a;
391 			break;
392 		default:
393 			ASSERT(false);
394 		}
395 	}
396 
397 	Color<float> Surface::Buffer::read(int x, int y, int z) const
398 	{
399 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB + z * samples * sliceB;
400 
401 		return read(element);
402 	}
403 
404 	Color<float> Surface::Buffer::read(int x, int y) const
405 	{
406 		void *element = (unsigned char*)buffer + (x + border) * bytes + (y + border) * pitchB;
407 
408 		return read(element);
409 	}
410 
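	// Decodes one texel into a floating-point color. Channels that the format
	// does not store keep their defaults: 0 for red, green and blue, 1 for alpha.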
411 	inline Color<float> Surface::Buffer::read(void *element) const
412 	{
413 		float r = 0.0f;
414 		float g = 0.0f;
415 		float b = 0.0f;
416 		float a = 1.0f;
417 
418 		switch(format)
419 		{
420 		case FORMAT_P8:
421 			{
422 				ASSERT(palette);
423 
424 				unsigned int abgr = palette[*(unsigned char*)element];
425 
426 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
427 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
428 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
429 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
430 			}
431 			break;
432 		case FORMAT_A8P8:
433 			{
434 				ASSERT(palette);
435 
436 				unsigned int bgr = palette[((unsigned char*)element)[0]];
437 
438 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
439 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
440 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
441 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
442 			}
443 			break;
444 		case FORMAT_A8:
445 			r = 0;
446 			g = 0;
447 			b = 0;
448 			a = *(unsigned char*)element * (1.0f / 0xFF);
449 			break;
450 		case FORMAT_R8_SNORM:
451 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
452 			break;
453 		case FORMAT_R8:
454 			r = *(unsigned char*)element * (1.0f / 0xFF);
455 			break;
456 		case FORMAT_R8I:
457 			r = *(signed char*)element;
458 			break;
459 		case FORMAT_R8UI:
460 			r = *(unsigned char*)element;
461 			break;
462 		case FORMAT_R3G3B2:
463 			{
464 				unsigned char rgb = *(unsigned char*)element;
465 
466 				r = (rgb & 0xE0) * (1.0f / 0xE0);
467 				g = (rgb & 0x1C) * (1.0f / 0x1C);
468 				b = (rgb & 0x03) * (1.0f / 0x03);
469 			}
470 			break;
471 		case FORMAT_A8R3G3B2:
472 			{
473 				unsigned short argb = *(unsigned short*)element;
474 
475 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
476 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
477 				g = (argb & 0x001C) * (1.0f / 0x001C);
478 				b = (argb & 0x0003) * (1.0f / 0x0003);
479 			}
480 			break;
481 		case FORMAT_X4R4G4B4:
482 			{
483 				unsigned short rgb = *(unsigned short*)element;
484 
485 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
486 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
487 				b = (rgb & 0x000F) * (1.0f / 0x000F);
488 			}
489 			break;
490 		case FORMAT_A4R4G4B4:
491 			{
492 				unsigned short argb = *(unsigned short*)element;
493 
494 				a = (argb & 0xF000) * (1.0f / 0xF000);
495 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
496 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
497 				b = (argb & 0x000F) * (1.0f / 0x000F);
498 			}
499 			break;
500 		case FORMAT_R4G4B4A4:
501 			{
502 				unsigned short rgba = *(unsigned short*)element;
503 
504 				r = (rgba & 0xF000) * (1.0f / 0xF000);
505 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
506 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
507 				a = (rgba & 0x000F) * (1.0f / 0x000F);
508 			}
509 			break;
510 		case FORMAT_R5G6B5:
511 			{
512 				unsigned short rgb = *(unsigned short*)element;
513 
514 				r = (rgb & 0xF800) * (1.0f / 0xF800);
515 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
516 				b = (rgb & 0x001F) * (1.0f / 0x001F);
517 			}
518 			break;
519 		case FORMAT_A1R5G5B5:
520 			{
521 				unsigned short argb = *(unsigned short*)element;
522 
523 				a = (argb & 0x8000) * (1.0f / 0x8000);
524 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
525 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
526 				b = (argb & 0x001F) * (1.0f / 0x001F);
527 			}
528 			break;
529 		case FORMAT_R5G5B5A1:
530 			{
531 				unsigned short rgba = *(unsigned short*)element;
532 
533 				r = (rgba & 0xF800) * (1.0f / 0xF800);
534 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
535 				b = (rgba & 0x003E) * (1.0f / 0x003E);
536 				a = (rgba & 0x0001) * (1.0f / 0x0001);
537 			}
538 			break;
539 		case FORMAT_X1R5G5B5:
540 			{
541 				unsigned short xrgb = *(unsigned short*)element;
542 
543 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
544 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
545 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
546 			}
547 			break;
548 		case FORMAT_A8R8G8B8:
549 			{
550 				unsigned int argb = *(unsigned int*)element;
551 
552 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
553 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
554 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
555 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
556 			}
557 			break;
558 		case FORMAT_X8R8G8B8:
559 			{
560 				unsigned int xrgb = *(unsigned int*)element;
561 
562 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
563 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
564 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
565 			}
566 			break;
567 		case FORMAT_A8B8G8R8_SNORM:
568 			{
569 				signed char* abgr = (signed char*)element;
570 
571 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
572 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
573 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
574 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
575 			}
576 			break;
577 		case FORMAT_A8B8G8R8:
578 		case FORMAT_SRGB8_A8:
579 			{
580 				unsigned int abgr = *(unsigned int*)element;
581 
582 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
583 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
584 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
585 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
586 			}
587 			break;
588 		case FORMAT_A8B8G8R8I:
589 			{
590 				signed char* abgr = (signed char*)element;
591 
592 				r = abgr[0];
593 				g = abgr[1];
594 				b = abgr[2];
595 				a = abgr[3];
596 			}
597 			break;
598 		case FORMAT_A8B8G8R8UI:
599 			{
600 				unsigned char* abgr = (unsigned char*)element;
601 
602 				r = abgr[0];
603 				g = abgr[1];
604 				b = abgr[2];
605 				a = abgr[3];
606 			}
607 			break;
608 		case FORMAT_X8B8G8R8_SNORM:
609 			{
610 				signed char* bgr = (signed char*)element;
611 
612 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
613 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
614 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
615 			}
616 			break;
617 		case FORMAT_X8B8G8R8:
618 		case FORMAT_SRGB8_X8:
619 			{
620 				unsigned int xbgr = *(unsigned int*)element;
621 
622 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
623 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
624 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
625 			}
626 			break;
627 		case FORMAT_X8B8G8R8I:
628 			{
629 				signed char* bgr = (signed char*)element;
630 
631 				r = bgr[0];
632 				g = bgr[1];
633 				b = bgr[2];
634 			}
635 			break;
636 		case FORMAT_X8B8G8R8UI:
637 			{
638 				unsigned char* bgr = (unsigned char*)element;
639 
640 				r = bgr[0];
641 				g = bgr[1];
642 				b = bgr[2];
643 			}
644 			break;
645 		case FORMAT_G8R8_SNORM:
646 			{
647 				signed char* gr = (signed char*)element;
648 
649 				r = max(gr[0] * (1.0f / 0x7F), -1.0f);
650 				g = max(gr[1] * (1.0f / 0x7F), -1.0f);
651 			}
652 			break;
653 		case FORMAT_G8R8:
654 			{
655 				unsigned short gr = *(unsigned short*)element;
656 
657 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
658 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
659 			}
660 			break;
661 		case FORMAT_G8R8I:
662 			{
663 				signed char* gr = (signed char*)element;
664 
665 				r = gr[0];
666 				g = gr[1];
667 			}
668 			break;
669 		case FORMAT_G8R8UI:
670 			{
671 				unsigned char* gr = (unsigned char*)element;
672 
673 				r = gr[0];
674 				g = gr[1];
675 			}
676 			break;
677 		case FORMAT_R16I:
678 			r = *((short*)element);
679 			break;
680 		case FORMAT_R16UI:
681 			r = *((unsigned short*)element);
682 			break;
683 		case FORMAT_G16R16I:
684 			{
685 				short* gr = (short*)element;
686 
687 				r = gr[0];
688 				g = gr[1];
689 			}
690 			break;
691 		case FORMAT_G16R16:
692 			{
693 				unsigned int gr = *(unsigned int*)element;
694 
695 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
696 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
697 			}
698 			break;
699 		case FORMAT_G16R16UI:
700 			{
701 				unsigned short* gr = (unsigned short*)element;
702 
703 				r = gr[0];
704 				g = gr[1];
705 			}
706 			break;
707 		case FORMAT_A2R10G10B10:
708 			{
709 				unsigned int argb = *(unsigned int*)element;
710 
711 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
712 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
713 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
714 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
715 			}
716 			break;
717 		case FORMAT_A2B10G10R10:
718 			{
719 				unsigned int abgr = *(unsigned int*)element;
720 
721 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
722 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
723 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
724 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
725 			}
726 			break;
727 		case FORMAT_A2B10G10R10UI:
728 			{
729 				unsigned int abgr = *(unsigned int*)element;
730 
731 				a = static_cast<float>((abgr & 0xC0000000) >> 30);
732 				b = static_cast<float>((abgr & 0x3FF00000) >> 20);
733 				g = static_cast<float>((abgr & 0x000FFC00) >> 10);
734 				r = static_cast<float>(abgr & 0x000003FF);
735 			}
736 			break;
737 		case FORMAT_A16B16G16R16I:
738 			{
739 				short* abgr = (short*)element;
740 
741 				r = abgr[0];
742 				g = abgr[1];
743 				b = abgr[2];
744 				a = abgr[3];
745 			}
746 			break;
747 		case FORMAT_A16B16G16R16:
748 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
749 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
750 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
751 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
752 			break;
753 		case FORMAT_A16B16G16R16UI:
754 			{
755 				unsigned short* abgr = (unsigned short*)element;
756 
757 				r = abgr[0];
758 				g = abgr[1];
759 				b = abgr[2];
760 				a = abgr[3];
761 			}
762 			break;
763 		case FORMAT_X16B16G16R16I:
764 			{
765 				short* bgr = (short*)element;
766 
767 				r = bgr[0];
768 				g = bgr[1];
769 				b = bgr[2];
770 			}
771 			break;
772 		case FORMAT_X16B16G16R16UI:
773 			{
774 				unsigned short* bgr = (unsigned short*)element;
775 
776 				r = bgr[0];
777 				g = bgr[1];
778 				b = bgr[2];
779 			}
780 			break;
781 		case FORMAT_A32B32G32R32I:
782 			{
783 				int* abgr = (int*)element;
784 
785 				r = static_cast<float>(abgr[0]);
786 				g = static_cast<float>(abgr[1]);
787 				b = static_cast<float>(abgr[2]);
788 				a = static_cast<float>(abgr[3]);
789 			}
790 			break;
791 		case FORMAT_A32B32G32R32UI:
792 			{
793 				unsigned int* abgr = (unsigned int*)element;
794 
795 				r = static_cast<float>(abgr[0]);
796 				g = static_cast<float>(abgr[1]);
797 				b = static_cast<float>(abgr[2]);
798 				a = static_cast<float>(abgr[3]);
799 			}
800 			break;
801 		case FORMAT_X32B32G32R32I:
802 			{
803 				int* bgr = (int*)element;
804 
805 				r = static_cast<float>(bgr[0]);
806 				g = static_cast<float>(bgr[1]);
807 				b = static_cast<float>(bgr[2]);
808 			}
809 			break;
810 		case FORMAT_X32B32G32R32UI:
811 			{
812 				unsigned int* bgr = (unsigned int*)element;
813 
814 				r = static_cast<float>(bgr[0]);
815 				g = static_cast<float>(bgr[1]);
816 				b = static_cast<float>(bgr[2]);
817 			}
818 			break;
819 		case FORMAT_G32R32I:
820 			{
821 				int* gr = (int*)element;
822 
823 				r = static_cast<float>(gr[0]);
824 				g = static_cast<float>(gr[1]);
825 			}
826 			break;
827 		case FORMAT_G32R32UI:
828 			{
829 				unsigned int* gr = (unsigned int*)element;
830 
831 				r = static_cast<float>(gr[0]);
832 				g = static_cast<float>(gr[1]);
833 			}
834 			break;
835 		case FORMAT_R32I:
836 			r = static_cast<float>(*((int*)element));
837 			break;
838 		case FORMAT_R32UI:
839 			r = static_cast<float>(*((unsigned int*)element));
840 			break;
841 		case FORMAT_V8U8:
842 			{
843 				unsigned short vu = *(unsigned short*)element;
844 
845 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
846 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
847 			}
848 			break;
849 		case FORMAT_L6V5U5:
850 			{
851 				unsigned short lvu = *(unsigned short*)element;
852 
853 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
854 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
855 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
856 			}
857 			break;
858 		case FORMAT_Q8W8V8U8:
859 			{
860 				unsigned int qwvu = *(unsigned int*)element;
861 
862 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
863 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
864 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
865 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
866 			}
867 			break;
868 		case FORMAT_X8L8V8U8:
869 			{
870 				unsigned int xlvu = *(unsigned int*)element;
871 
872 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
873 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
874 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
875 			}
876 			break;
877 		case FORMAT_R8G8B8:
878 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
879 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
880 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
881 			break;
882 		case FORMAT_B8G8R8:
883 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
884 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
885 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
886 			break;
887 		case FORMAT_V16U16:
888 			{
889 				unsigned int vu = *(unsigned int*)element;
890 
891 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
892 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
893 			}
894 			break;
895 		case FORMAT_A2W10V10U10:
896 			{
897 				unsigned int awvu = *(unsigned int*)element;
898 
899 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
900 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
901 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
902 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
903 			}
904 			break;
905 		case FORMAT_A16W16V16U16:
906 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
907 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
908 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
909 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
910 			break;
911 		case FORMAT_Q16W16V16U16:
912 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
913 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
914 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
915 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
916 			break;
917 		case FORMAT_L8:
918 			r =
919 			g =
920 			b = *(unsigned char*)element * (1.0f / 0xFF);
921 			break;
922 		case FORMAT_A4L4:
923 			{
924 				unsigned char al = *(unsigned char*)element;
925 
926 				r =
927 				g =
928 				b = (al & 0x0F) * (1.0f / 0x0F);
929 				a = (al & 0xF0) * (1.0f / 0xF0);
930 			}
931 			break;
932 		case FORMAT_L16:
933 			r =
934 			g =
935 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
936 			break;
937 		case FORMAT_A8L8:
938 			r =
939 			g =
940 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
941 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
942 			break;
943 		case FORMAT_L16F:
944 			r =
945 			g =
946 			b = *(half*)element;
947 			break;
948 		case FORMAT_A16L16F:
949 			r =
950 			g =
951 			b = ((half*)element)[0];
952 			a = ((half*)element)[1];
953 			break;
954 		case FORMAT_L32F:
955 			r =
956 			g =
957 			b = *(float*)element;
958 			break;
959 		case FORMAT_A32L32F:
960 			r =
961 			g =
962 			b = ((float*)element)[0];
963 			a = ((float*)element)[1];
964 			break;
965 		case FORMAT_A16F:
966 			a = *(half*)element;
967 			break;
968 		case FORMAT_R16F:
969 			r = *(half*)element;
970 			break;
971 		case FORMAT_G16R16F:
972 			r = ((half*)element)[0];
973 			g = ((half*)element)[1];
974 			break;
975 		case FORMAT_X16B16G16R16F:
976 		case FORMAT_X16B16G16R16F_UNSIGNED:
977 		case FORMAT_B16G16R16F:
978 			r = ((half*)element)[0];
979 			g = ((half*)element)[1];
980 			b = ((half*)element)[2];
981 			break;
982 		case FORMAT_A16B16G16R16F:
983 			r = ((half*)element)[0];
984 			g = ((half*)element)[1];
985 			b = ((half*)element)[2];
986 			a = ((half*)element)[3];
987 			break;
988 		case FORMAT_A32F:
989 			a = *(float*)element;
990 			break;
991 		case FORMAT_R32F:
992 			r = *(float*)element;
993 			break;
994 		case FORMAT_G32R32F:
995 			r = ((float*)element)[0];
996 			g = ((float*)element)[1];
997 			break;
998 		case FORMAT_X32B32G32R32F:
999 		case FORMAT_X32B32G32R32F_UNSIGNED:
1000 		case FORMAT_B32G32R32F:
1001 			r = ((float*)element)[0];
1002 			g = ((float*)element)[1];
1003 			b = ((float*)element)[2];
1004 			break;
1005 		case FORMAT_A32B32G32R32F:
1006 			r = ((float*)element)[0];
1007 			g = ((float*)element)[1];
1008 			b = ((float*)element)[2];
1009 			a = ((float*)element)[3];
1010 			break;
1011 		case FORMAT_D32F:
1012 		case FORMAT_D32FS8:
1013 		case FORMAT_D32F_LOCKABLE:
1014 		case FORMAT_D32FS8_TEXTURE:
1015 		case FORMAT_D32F_SHADOW:
1016 		case FORMAT_D32FS8_SHADOW:
1017 			r = *(float*)element;
1018 			g = r;
1019 			b = r;
1020 			a = r;
1021 			break;
1022 		case FORMAT_D32F_COMPLEMENTARY:
1023 		case FORMAT_D32FS8_COMPLEMENTARY:
1024 			r = 1.0f - *(float*)element;
1025 			g = r;
1026 			b = r;
1027 			a = r;
1028 			break;
1029 		case FORMAT_S8:
1030 			r = *(unsigned char*)element * (1.0f / 0xFF);
1031 			break;
1032 		default:
1033 			ASSERT(false);
1034 		}
1035 
1036 		if(isSRGBformat(format))
1037 		{
1038 			r = sRGBtoLinear(r);
1039 			g = sRGBtoLinear(g);
1040 			b = sRGBtoLinear(b);
1041 		}
1042 
1043 		return Color<float>(r, g, b, a);
1044 	}
1045 
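	// Samples the volume at unnormalized texel coordinates using trilinear
	// interpolation. The 0.5 offset converts from texel centers to the integer
	// lattice before the neighboring texels and weights are computed.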
1046 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1047 	{
1048 		x -= 0.5f;
1049 		y -= 0.5f;
1050 		z -= 0.5f;
1051 
1052 		int x0 = clamp((int)x, 0, width - 1);
1053 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1054 
1055 		int y0 = clamp((int)y, 0, height - 1);
1056 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1057 
1058 		int z0 = clamp((int)z, 0, depth - 1);
1059 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1060 
1061 		Color<float> c000 = read(x0, y0, z0);
1062 		Color<float> c100 = read(x1, y0, z0);
1063 		Color<float> c010 = read(x0, y1, z0);
1064 		Color<float> c110 = read(x1, y1, z0);
1065 		Color<float> c001 = read(x0, y0, z1);
1066 		Color<float> c101 = read(x1, y0, z1);
1067 		Color<float> c011 = read(x0, y1, z1);
1068 		Color<float> c111 = read(x1, y1, z1);
1069 
1070 		float fx = x - x0;
1071 		float fy = y - y0;
1072 		float fz = z - z0;
1073 
1074 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1075 		c100 *= fx * (1 - fy) * (1 - fz);
1076 		c010 *= (1 - fx) * fy * (1 - fz);
1077 		c110 *= fx * fy * (1 - fz);
1078 		c001 *= (1 - fx) * (1 - fy) * fz;
1079 		c101 *= fx * (1 - fy) * fz;
1080 		c011 *= (1 - fx) * fy * fz;
1081 		c111 *= fx * fy * fz;
1082 
1083 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1084 	}
1085 
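	// Bilinearly samples a single layer; no filtering is performed across layers.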
1086 	Color<float> Surface::Buffer::sample(float x, float y, int layer) const
1087 	{
1088 		x -= 0.5f;
1089 		y -= 0.5f;
1090 
1091 		int x0 = clamp((int)x, 0, width - 1);
1092 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1093 
1094 		int y0 = clamp((int)y, 0, height - 1);
1095 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1096 
1097 		Color<float> c00 = read(x0, y0, layer);
1098 		Color<float> c10 = read(x1, y0, layer);
1099 		Color<float> c01 = read(x0, y1, layer);
1100 		Color<float> c11 = read(x1, y1, layer);
1101 
1102 		float fx = x - x0;
1103 		float fy = y - y0;
1104 
1105 		c00 *= (1 - fx) * (1 - fy);
1106 		c10 *= fx * (1 - fy);
1107 		c01 *= (1 - fx) * fy;
1108 		c11 *= fx * fy;
1109 
1110 		return c00 + c10 + c01 + c11;
1111 	}
1112 
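	// Returns a pointer to the texel, or to the compressed block containing it.
	// Block-compressed formats address whole blocks, so x and y are divided by
	// the block dimensions and scaled by the block size in bytes.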
1113 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1114 	{
1115 		this->lock = lock;
1116 
1117 		switch(lock)
1118 		{
1119 		case LOCK_UNLOCKED:
1120 		case LOCK_READONLY:
1121 		case LOCK_UPDATE:
1122 			break;
1123 		case LOCK_WRITEONLY:
1124 		case LOCK_READWRITE:
1125 		case LOCK_DISCARD:
1126 			dirty = true;
1127 			break;
1128 		default:
1129 			ASSERT(false);
1130 		}
1131 
1132 		if(buffer)
1133 		{
1134 			x += border;
1135 			y += border;
1136 
1137 			switch(format)
1138 			{
1139 			case FORMAT_DXT1:
1140 			case FORMAT_ATI1:
1141 			case FORMAT_ETC1:
1142 			case FORMAT_R11_EAC:
1143 			case FORMAT_SIGNED_R11_EAC:
1144 			case FORMAT_RGB8_ETC2:
1145 			case FORMAT_SRGB8_ETC2:
1146 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1147 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1148 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1149 			case FORMAT_RG11_EAC:
1150 			case FORMAT_SIGNED_RG11_EAC:
1151 			case FORMAT_RGBA8_ETC2_EAC:
1152 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1153 			case FORMAT_RGBA_ASTC_4x4_KHR:
1154 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1155 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1156 			case FORMAT_RGBA_ASTC_5x4_KHR:
1157 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1158 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1159 			case FORMAT_RGBA_ASTC_5x5_KHR:
1160 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1161 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1162 			case FORMAT_RGBA_ASTC_6x5_KHR:
1163 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1164 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1165 			case FORMAT_RGBA_ASTC_6x6_KHR:
1166 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1167 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1168 			case FORMAT_RGBA_ASTC_8x5_KHR:
1169 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1170 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1171 			case FORMAT_RGBA_ASTC_8x6_KHR:
1172 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1173 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1174 			case FORMAT_RGBA_ASTC_8x8_KHR:
1175 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1176 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1177 			case FORMAT_RGBA_ASTC_10x5_KHR:
1178 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1179 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1180 			case FORMAT_RGBA_ASTC_10x6_KHR:
1181 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1182 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1183 			case FORMAT_RGBA_ASTC_10x8_KHR:
1184 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1185 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1186 			case FORMAT_RGBA_ASTC_10x10_KHR:
1187 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1188 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1189 			case FORMAT_RGBA_ASTC_12x10_KHR:
1190 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1191 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1192 			case FORMAT_RGBA_ASTC_12x12_KHR:
1193 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1194 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1195 			case FORMAT_DXT3:
1196 			case FORMAT_DXT5:
1197 			case FORMAT_ATI2:
1198 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1199 			default:
1200 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * samples * sliceB;
1201 			}
1202 		}
1203 
1204 		return nullptr;
1205 	}
1206 
1207 	void Surface::Buffer::unlockRect()
1208 	{
1209 		lock = LOCK_UNLOCKED;
1210 	}
1211 
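	// Concrete subclass instantiated by Surface::create(); it simply forwards
	// lockInternal()/unlockInternal() to the Surface base class.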
1212 	class SurfaceImplementation : public Surface
1213 	{
1214 	public:
1215 		SurfaceImplementation(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1216 			: Surface(width, height, depth, format, pixels, pitch, slice) {}
1217 		SurfaceImplementation(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchP = 0)
1218 			: Surface(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchP) {}
1219 		~SurfaceImplementation() override {};
1220 
1221 		void *lockInternal(int x, int y, int z, Lock lock, Accessor client) override
1222 		{
1223 			return Surface::lockInternal(x, y, z, lock, client);
1224 		}
1225 
1226 		void unlockInternal() override
1227 		{
1228 			Surface::unlockInternal();
1229 		}
1230 	};
1231 
1232 	Surface *Surface::create(int width, int height, int depth, Format format, void *pixels, int pitch, int slice)
1233 	{
1234 		return new SurfaceImplementation(width, height, depth, format, pixels, pitch, slice);
1235 	}
1236 
1237 	Surface *Surface::create(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided)
1238 	{
1239 		return new SurfaceImplementation(texture, width, height, depth, border, samples, format, lockable, renderTarget, pitchPprovided);
1240 	}
1241 
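	// Wraps caller-provided pixel data. The external buffer aliases 'pixels'
	// (ownExternal is false, so it is not freed here), while the internal
	// buffer uses the layout of the selected internal format.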
1242 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1243 	{
1244 		resource = new Resource(0);
1245 		hasParent = false;
1246 		ownExternal = false;
1247 		depth = max(1, depth);
1248 
1249 		external.buffer = pixels;
1250 		external.width = width;
1251 		external.height = height;
1252 		external.depth = depth;
1253 		external.samples = 1;
1254 		external.format = format;
1255 		external.bytes = bytes(external.format);
1256 		external.pitchB = pitch;
1257 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1258 		external.sliceB = slice;
1259 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1260 		external.border = 0;
1261 		external.lock = LOCK_UNLOCKED;
1262 		external.dirty = true;
1263 
1264 		internal.buffer = nullptr;
1265 		internal.width = width;
1266 		internal.height = height;
1267 		internal.depth = depth;
1268 		internal.samples = 1;
1269 		internal.format = selectInternalFormat(format);
1270 		internal.bytes = bytes(internal.format);
1271 		internal.pitchB = pitchB(internal.width, 0, internal.format, false);
1272 		internal.pitchP = pitchP(internal.width, 0, internal.format, false);
1273 		internal.sliceB = sliceB(internal.width, internal.height, 0, internal.format, false);
1274 		internal.sliceP = sliceP(internal.width, internal.height, 0, internal.format, false);
1275 		internal.border = 0;
1276 		internal.lock = LOCK_UNLOCKED;
1277 		internal.dirty = false;
1278 
1279 		stencil.buffer = nullptr;
1280 		stencil.width = width;
1281 		stencil.height = height;
1282 		stencil.depth = depth;
1283 		stencil.samples = 1;
1284 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1285 		stencil.bytes = bytes(stencil.format);
1286 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, false);
1287 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, false);
1288 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, false);
1289 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, false);
1290 		stencil.border = 0;
1291 		stencil.lock = LOCK_UNLOCKED;
1292 		stencil.dirty = false;
1293 
1294 		dirtyContents = true;
1295 		paletteUsed = 0;
1296 	}
1297 
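	// Creates a texture or render-target surface. Buffers are allocated lazily
	// on first lock. A non-zero pitchPprovided overrides the computed internal
	// pitch (expressed in pixels).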
1298 	Surface::Surface(Resource *texture, int width, int height, int depth, int border, int samples, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1299 	{
1300 		resource = texture ? texture : new Resource(0);
1301 		hasParent = texture != nullptr;
1302 		ownExternal = true;
1303 		depth = max(1, depth);
1304 		samples = max(1, samples);
1305 
1306 		external.buffer = nullptr;
1307 		external.width = width;
1308 		external.height = height;
1309 		external.depth = depth;
1310 		external.samples = (short)samples;
1311 		external.format = format;
1312 		external.bytes = bytes(external.format);
1313 		external.pitchB = pitchB(external.width, 0, external.format, renderTarget && !texture);
1314 		external.pitchP = pitchP(external.width, 0, external.format, renderTarget && !texture);
1315 		external.sliceB = sliceB(external.width, external.height, 0, external.format, renderTarget && !texture);
1316 		external.sliceP = sliceP(external.width, external.height, 0, external.format, renderTarget && !texture);
1317 		external.border = 0;
1318 		external.lock = LOCK_UNLOCKED;
1319 		external.dirty = false;
1320 
1321 		internal.buffer = nullptr;
1322 		internal.width = width;
1323 		internal.height = height;
1324 		internal.depth = depth;
1325 		internal.samples = (short)samples;
1326 		internal.format = selectInternalFormat(format);
1327 		internal.bytes = bytes(internal.format);
1328 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, border, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1329 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, border, internal.format, renderTarget) : pitchPprovided;
1330 		internal.sliceB = sliceB(internal.width, internal.height, border, internal.format, renderTarget);
1331 		internal.sliceP = sliceP(internal.width, internal.height, border, internal.format, renderTarget);
1332 		internal.border = (short)border;
1333 		internal.lock = LOCK_UNLOCKED;
1334 		internal.dirty = false;
1335 
1336 		stencil.buffer = nullptr;
1337 		stencil.width = width;
1338 		stencil.height = height;
1339 		stencil.depth = depth;
1340 		stencil.samples = (short)samples;
1341 		stencil.format = isStencil(format) ? FORMAT_S8 : FORMAT_NULL;
1342 		stencil.bytes = bytes(stencil.format);
1343 		stencil.pitchB = pitchB(stencil.width, 0, stencil.format, renderTarget);
1344 		stencil.pitchP = pitchP(stencil.width, 0, stencil.format, renderTarget);
1345 		stencil.sliceB = sliceB(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1346 		stencil.sliceP = sliceP(stencil.width, stencil.height, 0, stencil.format, renderTarget);
1347 		stencil.border = 0;
1348 		stencil.lock = LOCK_UNLOCKED;
1349 		stencil.dirty = false;
1350 
1351 		dirtyContents = true;
1352 		paletteUsed = 0;
1353 	}
1354 
1355 	Surface::~Surface()
1356 	{
1357 		// sync() must be called before this destructor to ensure all locks have been released.
1358 		// We can't call it here because the parent resource may already have been destroyed.
1359 		ASSERT(isUnlocked());
1360 
1361 		if(!hasParent)
1362 		{
1363 			resource->destruct();
1364 		}
1365 
1366 		if(ownExternal)
1367 		{
1368 			deallocate(external.buffer);
1369 		}
1370 
1371 		if(internal.buffer != external.buffer)
1372 		{
1373 			deallocate(internal.buffer);
1374 		}
1375 
1376 		deallocate(stencil.buffer);
1377 
1378 		external.buffer = 0;
1379 		internal.buffer = 0;
1380 		stencil.buffer = 0;
1381 	}
1382 
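	// Locks the buffer in the external (client-visible) format. If the internal
	// copy holds newer data, it is converted back first, unless the lock
	// discards the previous contents.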
1383 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1384 	{
1385 		resource->lock(client);
1386 
1387 		if(!external.buffer)
1388 		{
1389 			if(internal.buffer && identicalFormats())
1390 			{
1391 				external.buffer = internal.buffer;
1392 			}
1393 			else
1394 			{
1395 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.border, external.samples, external.format);
1396 			}
1397 		}
1398 
1399 		if(internal.dirty)
1400 		{
1401 			if(lock != LOCK_DISCARD)
1402 			{
1403 				update(external, internal);
1404 			}
1405 
1406 			internal.dirty = false;
1407 		}
1408 
1409 		switch(lock)
1410 		{
1411 		case LOCK_READONLY:
1412 			break;
1413 		case LOCK_WRITEONLY:
1414 		case LOCK_READWRITE:
1415 		case LOCK_DISCARD:
1416 			dirtyContents = true;
1417 			break;
1418 		default:
1419 			ASSERT(false);
1420 		}
1421 
1422 		return external.lockRect(x, y, z, lock);
1423 	}
1424 
1425 	void Surface::unlockExternal()
1426 	{
1427 		external.unlockRect();
1428 
1429 		resource->unlock();
1430 	}
1431 
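	// Locks the buffer in the internal (device) format. Dirty external data, or
	// a palette change for paletted formats, triggers a conversion into the
	// internal buffer before the pointer is returned.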
1432 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1433 	{
1434 		if(lock != LOCK_UNLOCKED)
1435 		{
1436 			resource->lock(client);
1437 		}
1438 
1439 		if(!internal.buffer)
1440 		{
1441 			if(external.buffer && identicalFormats())
1442 			{
1443 				internal.buffer = external.buffer;
1444 			}
1445 			else
1446 			{
1447 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.border, internal.samples, internal.format);
1448 			}
1449 		}
1450 
1451 		// FIXME: WHQL requires conversion to lower external precision and back
1452 		if(logPrecision >= WHQL)
1453 		{
1454 			if(internal.dirty && renderTarget && internal.format != external.format)
1455 			{
1456 				if(lock != LOCK_DISCARD)
1457 				{
1458 					switch(external.format)
1459 					{
1460 					case FORMAT_R3G3B2:
1461 					case FORMAT_A8R3G3B2:
1462 					case FORMAT_A1R5G5B5:
1463 					case FORMAT_A2R10G10B10:
1464 					case FORMAT_A2B10G10R10:
1465 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1466 						unlockExternal();
1467 						break;
1468 					default:
1469 						// Difference passes WHQL
1470 						break;
1471 					}
1472 				}
1473 			}
1474 		}
1475 
1476 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1477 		{
1478 			if(lock != LOCK_DISCARD)
1479 			{
1480 				update(internal, external);
1481 			}
1482 
1483 			external.dirty = false;
1484 			paletteUsed = Surface::paletteID;
1485 		}
1486 
1487 		switch(lock)
1488 		{
1489 		case LOCK_UNLOCKED:
1490 		case LOCK_READONLY:
1491 			break;
1492 		case LOCK_WRITEONLY:
1493 		case LOCK_READWRITE:
1494 		case LOCK_DISCARD:
1495 			dirtyContents = true;
1496 			break;
1497 		default:
1498 			ASSERT(false);
1499 		}
1500 
1501 		if(lock == LOCK_READONLY && client == PUBLIC)
1502 		{
1503 			resolve();
1504 		}
1505 
1506 		return internal.lockRect(x, y, z, lock);
1507 	}
1508 
1509 	void Surface::unlockInternal()
1510 	{
1511 		internal.unlockRect();
1512 
1513 		resource->unlock();
1514 	}
1515 
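	// Locks the separate 8-bit stencil plane, allocating it on demand. Surfaces
	// without stencil return nullptr.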
1516 	void *Surface::lockStencil(int x, int y, int front, Accessor client)
1517 	{
1518 		if(stencil.format == FORMAT_NULL)
1519 		{
1520 			return nullptr;
1521 		}
1522 
1523 		resource->lock(client);
1524 
1525 		if(!stencil.buffer)
1526 		{
1527 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.border, stencil.samples, stencil.format);
1528 		}
1529 
1530 		return stencil.lockRect(x, y, front, LOCK_READWRITE);   // FIXME
1531 	}
1532 
1533 	void Surface::unlockStencil()
1534 	{
1535 		stencil.unlockRect();
1536 
1537 		resource->unlock();
1538 	}
1539 
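	// Returns the byte size of a single pixel. For the 4x4 block-compressed
	// formats this is the size of a one-pixel-wide column of a block (see the
	// per-case comments); ASTC block sizes are not implemented yet.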
1540 	int Surface::bytes(Format format)
1541 	{
1542 		switch(format)
1543 		{
1544 		case FORMAT_NULL:				return 0;
1545 		case FORMAT_P8:					return 1;
1546 		case FORMAT_A8P8:				return 2;
1547 		case FORMAT_A8:					return 1;
1548 		case FORMAT_R8I:				return 1;
1549 		case FORMAT_R8:					return 1;
1550 		case FORMAT_R3G3B2:				return 1;
1551 		case FORMAT_R16I:				return 2;
1552 		case FORMAT_R16UI:				return 2;
1553 		case FORMAT_A8R3G3B2:			return 2;
1554 		case FORMAT_R5G6B5:				return 2;
1555 		case FORMAT_A1R5G5B5:			return 2;
1556 		case FORMAT_X1R5G5B5:			return 2;
1557 		case FORMAT_R5G5B5A1:           return 2;
1558 		case FORMAT_X4R4G4B4:			return 2;
1559 		case FORMAT_A4R4G4B4:			return 2;
1560 		case FORMAT_R4G4B4A4:           return 2;
1561 		case FORMAT_R8G8B8:				return 3;
1562 		case FORMAT_B8G8R8:             return 3;
1563 		case FORMAT_R32I:				return 4;
1564 		case FORMAT_R32UI:				return 4;
1565 		case FORMAT_X8R8G8B8:			return 4;
1566 	//	case FORMAT_X8G8R8B8Q:			return 4;
1567 		case FORMAT_A8R8G8B8:			return 4;
1568 	//	case FORMAT_A8G8R8B8Q:			return 4;
1569 		case FORMAT_X8B8G8R8I:			return 4;
1570 		case FORMAT_X8B8G8R8:			return 4;
1571 		case FORMAT_SRGB8_X8:			return 4;
1572 		case FORMAT_SRGB8_A8:			return 4;
1573 		case FORMAT_A8B8G8R8I:			return 4;
1574 		case FORMAT_R8UI:				return 1;
1575 		case FORMAT_G8R8UI:				return 2;
1576 		case FORMAT_X8B8G8R8UI:			return 4;
1577 		case FORMAT_A8B8G8R8UI:			return 4;
1578 		case FORMAT_A8B8G8R8:			return 4;
1579 		case FORMAT_R8_SNORM:			return 1;
1580 		case FORMAT_G8R8_SNORM:		return 2;
1581 		case FORMAT_X8B8G8R8_SNORM:	return 4;
1582 		case FORMAT_A8B8G8R8_SNORM:	return 4;
1583 		case FORMAT_A2R10G10B10:		return 4;
1584 		case FORMAT_A2B10G10R10:		return 4;
1585 		case FORMAT_A2B10G10R10UI:		return 4;
1586 		case FORMAT_G8R8I:				return 2;
1587 		case FORMAT_G8R8:				return 2;
1588 		case FORMAT_G16R16I:			return 4;
1589 		case FORMAT_G16R16UI:			return 4;
1590 		case FORMAT_G16R16:				return 4;
1591 		case FORMAT_G32R32I:			return 8;
1592 		case FORMAT_G32R32UI:			return 8;
1593 		case FORMAT_X16B16G16R16I:		return 8;
1594 		case FORMAT_X16B16G16R16UI:		return 8;
1595 		case FORMAT_A16B16G16R16I:		return 8;
1596 		case FORMAT_A16B16G16R16UI:		return 8;
1597 		case FORMAT_A16B16G16R16:		return 8;
1598 		case FORMAT_X32B32G32R32I:		return 16;
1599 		case FORMAT_X32B32G32R32UI:		return 16;
1600 		case FORMAT_A32B32G32R32I:		return 16;
1601 		case FORMAT_A32B32G32R32UI:		return 16;
1602 		// Compressed formats
1603 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1604 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1605 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1606 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1607 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1608 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1609 		case FORMAT_R11_EAC:			return 2;
1610 		case FORMAT_SIGNED_R11_EAC:		return 2;
1611 		case FORMAT_RG11_EAC:			return 4;
1612 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1613 		case FORMAT_RGB8_ETC2:			return 2;
1614 		case FORMAT_SRGB8_ETC2:			return 2;
1615 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1616 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1617 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1618 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1619 		case FORMAT_RGBA_ASTC_4x4_KHR:
1620 		case FORMAT_RGBA_ASTC_5x4_KHR:
1621 		case FORMAT_RGBA_ASTC_5x5_KHR:
1622 		case FORMAT_RGBA_ASTC_6x5_KHR:
1623 		case FORMAT_RGBA_ASTC_6x6_KHR:
1624 		case FORMAT_RGBA_ASTC_8x5_KHR:
1625 		case FORMAT_RGBA_ASTC_8x6_KHR:
1626 		case FORMAT_RGBA_ASTC_8x8_KHR:
1627 		case FORMAT_RGBA_ASTC_10x5_KHR:
1628 		case FORMAT_RGBA_ASTC_10x6_KHR:
1629 		case FORMAT_RGBA_ASTC_10x8_KHR:
1630 		case FORMAT_RGBA_ASTC_10x10_KHR:
1631 		case FORMAT_RGBA_ASTC_12x10_KHR:
1632 		case FORMAT_RGBA_ASTC_12x12_KHR:
1633 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1634 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1635 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1636 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1637 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1638 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1639 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1640 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1641 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1642 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1643 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1644 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1645 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1646 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1647 		// Bumpmap formats
1648 		case FORMAT_V8U8:				return 2;
1649 		case FORMAT_L6V5U5:				return 2;
1650 		case FORMAT_Q8W8V8U8:			return 4;
1651 		case FORMAT_X8L8V8U8:			return 4;
1652 		case FORMAT_A2W10V10U10:		return 4;
1653 		case FORMAT_V16U16:				return 4;
1654 		case FORMAT_A16W16V16U16:		return 8;
1655 		case FORMAT_Q16W16V16U16:		return 8;
1656 		// Luminance formats
1657 		case FORMAT_L8:					return 1;
1658 		case FORMAT_A4L4:				return 1;
1659 		case FORMAT_L16:				return 2;
1660 		case FORMAT_A8L8:				return 2;
1661 		case FORMAT_L16F:               return 2;
1662 		case FORMAT_A16L16F:            return 4;
1663 		case FORMAT_L32F:               return 4;
1664 		case FORMAT_A32L32F:            return 8;
1665 		// Floating-point formats
1666 		case FORMAT_A16F:				return 2;
1667 		case FORMAT_R16F:				return 2;
1668 		case FORMAT_G16R16F:			return 4;
1669 		case FORMAT_B16G16R16F:			return 6;
1670 		case FORMAT_X16B16G16R16F:		return 8;
1671 		case FORMAT_A16B16G16R16F:		return 8;
1672 		case FORMAT_X16B16G16R16F_UNSIGNED: return 8;
1673 		case FORMAT_A32F:				return 4;
1674 		case FORMAT_R32F:				return 4;
1675 		case FORMAT_G32R32F:			return 8;
1676 		case FORMAT_B32G32R32F:			return 12;
1677 		case FORMAT_X32B32G32R32F:		return 16;
1678 		case FORMAT_A32B32G32R32F:		return 16;
1679 		case FORMAT_X32B32G32R32F_UNSIGNED: return 16;
1680 		// Depth/stencil formats
1681 		case FORMAT_D16:				return 2;
1682 		case FORMAT_D32:				return 4;
1683 		case FORMAT_D24X8:				return 4;
1684 		case FORMAT_D24S8:				return 4;
1685 		case FORMAT_D24FS8:				return 4;
1686 		case FORMAT_D32F:				return 4;
1687 		case FORMAT_D32FS8:				return 4;
1688 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1689 		case FORMAT_D32FS8_COMPLEMENTARY: return 4;
1690 		case FORMAT_D32F_LOCKABLE:		return 4;
1691 		case FORMAT_D32FS8_TEXTURE:		return 4;
1692 		case FORMAT_D32F_SHADOW:		return 4;
1693 		case FORMAT_D32FS8_SHADOW:		return 4;
1694 		case FORMAT_DF24S8:				return 4;
1695 		case FORMAT_DF16S8:				return 2;
1696 		case FORMAT_INTZ:				return 4;
1697 		case FORMAT_S8:					return 1;
1698 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1699 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1700 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1701 		default:
1702 			ASSERT(false);
1703 		}
1704 
1705 		return 0;
1706 	}
1707 
1708 	int Surface::pitchB(int width, int border, Format format, bool target)
1709 	{
1710 		width += 2 * border;
1711 
1712 		if(target || isDepth(format) || isStencil(format))
1713 		{
1714 			width = align(width, 2);
1715 		}
1716 
1717 		switch(format)
1718 		{
1719 		case FORMAT_DXT1:
1720 		case FORMAT_ETC1:
1721 		case FORMAT_R11_EAC:
1722 		case FORMAT_SIGNED_R11_EAC:
1723 		case FORMAT_RGB8_ETC2:
1724 		case FORMAT_SRGB8_ETC2:
1725 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1726 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1727 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1728 		case FORMAT_RG11_EAC:
1729 		case FORMAT_SIGNED_RG11_EAC:
1730 		case FORMAT_RGBA8_ETC2_EAC:
1731 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1732 		case FORMAT_RGBA_ASTC_4x4_KHR:
1733 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1734 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
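		// All ASTC footprints below use 128-bit (16 byte) blocks; only the block
		// width in texels differs between the cases.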
1735 		case FORMAT_RGBA_ASTC_5x4_KHR:
1736 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1737 		case FORMAT_RGBA_ASTC_5x5_KHR:
1738 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1739 			return 16 * ((width + 4) / 5);
1740 		case FORMAT_RGBA_ASTC_6x5_KHR:
1741 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1742 		case FORMAT_RGBA_ASTC_6x6_KHR:
1743 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1744 			return 16 * ((width + 5) / 6);
1745 		case FORMAT_RGBA_ASTC_8x5_KHR:
1746 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1747 		case FORMAT_RGBA_ASTC_8x6_KHR:
1748 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1749 		case FORMAT_RGBA_ASTC_8x8_KHR:
1750 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1751 			return 16 * ((width + 7) / 8);
1752 		case FORMAT_RGBA_ASTC_10x5_KHR:
1753 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1754 		case FORMAT_RGBA_ASTC_10x6_KHR:
1755 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1756 		case FORMAT_RGBA_ASTC_10x8_KHR:
1757 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1758 		case FORMAT_RGBA_ASTC_10x10_KHR:
1759 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1760 			return 16 * ((width + 9) / 10);
1761 		case FORMAT_RGBA_ASTC_12x10_KHR:
1762 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1763 		case FORMAT_RGBA_ASTC_12x12_KHR:
1764 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1765 			return 16 * ((width + 11) / 12);
1766 		case FORMAT_DXT3:
1767 		case FORMAT_DXT5:
1768 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1769 		case FORMAT_ATI1:
1770 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1771 		case FORMAT_ATI2:
1772 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1773 		case FORMAT_YV12_BT601:
1774 		case FORMAT_YV12_BT709:
1775 		case FORMAT_YV12_JFIF:
1776 			return align(width, 16);
1777 		default:
1778 			return bytes(format) * width;
1779 		}
1780 	}
1781 
1782 	int Surface::pitchP(int width, int border, Format format, bool target)
1783 	{
1784 		int B = bytes(format);
1785 
1786 		return B > 0 ? pitchB(width, border, format, target) / B : 0;
1787 	}
1788 
1789 	int Surface::sliceB(int width, int height, int border, Format format, bool target)
1790 	{
1791 		height += 2 * border;
1792 
1793 		if(target || isDepth(format) || isStencil(format))
1794 		{
1795 			height = align(height, 2);
1796 		}
1797 
1798 		switch(format)
1799 		{
1800 		case FORMAT_DXT1:
1801 		case FORMAT_DXT3:
1802 		case FORMAT_DXT5:
1803 		case FORMAT_ETC1:
1804 		case FORMAT_R11_EAC:
1805 		case FORMAT_SIGNED_R11_EAC:
1806 		case FORMAT_RG11_EAC:
1807 		case FORMAT_SIGNED_RG11_EAC:
1808 		case FORMAT_RGB8_ETC2:
1809 		case FORMAT_SRGB8_ETC2:
1810 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1811 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1812 		case FORMAT_RGBA8_ETC2_EAC:
1813 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1814 		case FORMAT_RGBA_ASTC_4x4_KHR:
1815 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1816 		case FORMAT_RGBA_ASTC_5x4_KHR:
1817 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1818 			return pitchB(width, border, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1819 		case FORMAT_RGBA_ASTC_5x5_KHR:
1820 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1821 		case FORMAT_RGBA_ASTC_6x5_KHR:
1822 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1823 		case FORMAT_RGBA_ASTC_8x5_KHR:
1824 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1825 		case FORMAT_RGBA_ASTC_10x5_KHR:
1826 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1827 			return pitchB(width, border, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1828 		case FORMAT_RGBA_ASTC_6x6_KHR:
1829 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1830 		case FORMAT_RGBA_ASTC_8x6_KHR:
1831 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1832 		case FORMAT_RGBA_ASTC_10x6_KHR:
1833 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1834 			return pitchB(width, border, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1835 		case FORMAT_RGBA_ASTC_8x8_KHR:
1836 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1837 		case FORMAT_RGBA_ASTC_10x8_KHR:
1838 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1839 			return pitchB(width, border, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1840 		case FORMAT_RGBA_ASTC_10x10_KHR:
1841 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1842 		case FORMAT_RGBA_ASTC_12x10_KHR:
1843 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1844 			return pitchB(width, border, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1845 		case FORMAT_RGBA_ASTC_12x12_KHR:
1846 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1847 			return pitchB(width, border, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1848 		case FORMAT_ATI1:
1849 		case FORMAT_ATI2:
1850 		default:
1851 			return pitchB(width, border, format, target) * height;   // Pitch computed per row
1852 		}
1853 	}
1854 
1855 	int Surface::sliceP(int width, int height, int border, Format format, bool target)
1856 	{
1857 		int B = bytes(format);
1858 
1859 		return B > 0 ? sliceB(width, height, border, format, target) / B : 0;
1860 	}
1861 
1862 	void Surface::update(Buffer &destination, Buffer &source)
1863 	{
1864 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1865 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1866 
1867 		if(destination.buffer != source.buffer)
1868 		{
1869 			ASSERT(source.dirty && !destination.dirty);
1870 
1871 			switch(source.format)
1872 			{
1873 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1874 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1875 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1876 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1877 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1878 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1879 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1880 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1881 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1882 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1883 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1884 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1885 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1886 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1887 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1888 			case FORMAT_ETC1:
1889 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1890 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1891 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1892 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1893 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1894 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1895 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1896 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1897 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1898 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1899 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1900 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1901 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1902 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1903 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1904 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1905 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1906 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1907 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1908 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1909 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1910 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1911 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1912 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1913 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1914 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1915 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1916 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1917 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1918 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1919 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1920 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1921 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1922 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1923 			default:				genericUpdate(destination, source);		break;
1924 			}
1925 		}
1926 	}
1927 
1928 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1929 	{
1930 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1931 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1932 
1933 		int depth = min(destination.depth, source.depth);
1934 		int height = min(destination.height, source.height);
1935 		int width = min(destination.width, source.width);
1936 		int rowBytes = width * source.bytes;
1937 
1938 		for(int z = 0; z < depth; z++)
1939 		{
1940 			unsigned char *sourceRow = sourceSlice;
1941 			unsigned char *destinationRow = destinationSlice;
1942 
1943 			for(int y = 0; y < height; y++)
1944 			{
1945 				if(source.format == destination.format)
1946 				{
1947 					memcpy(destinationRow, sourceRow, rowBytes);
1948 				}
1949 				else
1950 				{
1951 					unsigned char *sourceElement = sourceRow;
1952 					unsigned char *destinationElement = destinationRow;
1953 
1954 					for(int x = 0; x < width; x++)
1955 					{
1956 						Color<float> color = source.read(sourceElement);
1957 						destination.write(destinationElement, color);
1958 
1959 						sourceElement += source.bytes;
1960 						destinationElement += destination.bytes;
1961 					}
1962 				}
1963 
1964 				sourceRow += source.pitchB;
1965 				destinationRow += destination.pitchB;
1966 			}
1967 
1968 			sourceSlice += source.sliceB;
1969 			destinationSlice += destination.sliceB;
1970 		}
1971 
1972 		source.unlockRect();
1973 		destination.unlockRect();
1974 	}
1975 
1976 	void Surface::decodeR8G8B8(Buffer &destination, Buffer &source)
1977 	{
1978 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
1979 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
1980 
1981 		int depth = min(destination.depth, source.depth);
1982 		int height = min(destination.height, source.height);
1983 		int width = min(destination.width, source.width);
1984 
1985 		for(int z = 0; z < depth; z++)
1986 		{
1987 			unsigned char *sourceRow = sourceSlice;
1988 			unsigned char *destinationRow = destinationSlice;
1989 
1990 			for(int y = 0; y < height; y++)
1991 			{
1992 				unsigned char *sourceElement = sourceRow;
1993 				unsigned char *destinationElement = destinationRow;
1994 
1995 				for(int x = 0; x < width; x++)
1996 				{
1997 					unsigned int b = sourceElement[0];
1998 					unsigned int g = sourceElement[1];
1999 					unsigned int r = sourceElement[2];
2000 
2001 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
2002 
2003 					sourceElement += source.bytes;
2004 					destinationElement += destination.bytes;
2005 				}
2006 
2007 				sourceRow += source.pitchB;
2008 				destinationRow += destination.pitchB;
2009 			}
2010 
2011 			sourceSlice += source.sliceB;
2012 			destinationSlice += destination.sliceB;
2013 		}
2014 
2015 		source.unlockRect();
2016 		destination.unlockRect();
2017 	}
2018 
2019 	void Surface::decodeX1R5G5B5(Buffer &destination, Buffer &source)
2020 	{
2021 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2022 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2023 
2024 		int depth = min(destination.depth, source.depth);
2025 		int height = min(destination.height, source.height);
2026 		int width = min(destination.width, source.width);
2027 
2028 		for(int z = 0; z < depth; z++)
2029 		{
2030 			unsigned char *sourceRow = sourceSlice;
2031 			unsigned char *destinationRow = destinationSlice;
2032 
2033 			for(int y = 0; y < height; y++)
2034 			{
2035 				unsigned char *sourceElement = sourceRow;
2036 				unsigned char *destinationElement = destinationRow;
2037 
2038 				for(int x = 0; x < width; x++)
2039 				{
2040 					unsigned int xrgb = *(unsigned short*)sourceElement;
2041 
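					// The multipliers expand each 5-bit channel to 8 bits (scale by 255/31)
					// and shift it into its byte lane, with the added constants providing rounding:
					// 134771 ~= (255/31) * 2^14 (red, bits 16-23), 16846 ~= (255/31) * 2^11 (green, bits 8-15),
					// 2106 ~= (255/31) * 2^8 (blue, bits 0-7).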
2042 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2043 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
2044 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
2045 
2046 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2047 
2048 					sourceElement += source.bytes;
2049 					destinationElement += destination.bytes;
2050 				}
2051 
2052 				sourceRow += source.pitchB;
2053 				destinationRow += destination.pitchB;
2054 			}
2055 
2056 			sourceSlice += source.sliceB;
2057 			destinationSlice += destination.sliceB;
2058 		}
2059 
2060 		source.unlockRect();
2061 		destination.unlockRect();
2062 	}
2063 
2064 	void Surface::decodeA1R5G5B5(Buffer &destination, Buffer &source)
2065 	{
2066 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2067 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2068 
2069 		int depth = min(destination.depth, source.depth);
2070 		int height = min(destination.height, source.height);
2071 		int width = min(destination.width, source.width);
2072 
2073 		for(int z = 0; z < depth; z++)
2074 		{
2075 			unsigned char *sourceRow = sourceSlice;
2076 			unsigned char *destinationRow = destinationSlice;
2077 
2078 			for(int y = 0; y < height; y++)
2079 			{
2080 				unsigned char *sourceElement = sourceRow;
2081 				unsigned char *destinationElement = destinationRow;
2082 
2083 				for(int x = 0; x < width; x++)
2084 				{
2085 					unsigned int argb = *(unsigned short*)sourceElement;
2086 
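					// 0x8000 * 130560 == 0xFF000000, so a set alpha bit expands to fully
					// opaque 8-bit alpha; the RGB expansion matches decodeX1R5G5B5 above.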
2087 					unsigned int a =   (argb & 0x8000) * 130560;
2088 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
2089 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
2090 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
2091 
2092 					*(unsigned int*)destinationElement = a | r | g | b;
2093 
2094 					sourceElement += source.bytes;
2095 					destinationElement += destination.bytes;
2096 				}
2097 
2098 				sourceRow += source.pitchB;
2099 				destinationRow += destination.pitchB;
2100 			}
2101 
2102 			sourceSlice += source.sliceB;
2103 			destinationSlice += destination.sliceB;
2104 		}
2105 
2106 		source.unlockRect();
2107 		destination.unlockRect();
2108 	}
2109 
2110 	void Surface::decodeX4R4G4B4(Buffer &destination, Buffer &source)
2111 	{
2112 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2113 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2114 
2115 		int depth = min(destination.depth, source.depth);
2116 		int height = min(destination.height, source.height);
2117 		int width = min(destination.width, source.width);
2118 
2119 		for(int z = 0; z < depth; z++)
2120 		{
2121 			unsigned char *sourceRow = sourceSlice;
2122 			unsigned char *destinationRow = destinationSlice;
2123 
2124 			for(int y = 0; y < height; y++)
2125 			{
2126 				unsigned char *sourceElement = sourceRow;
2127 				unsigned char *destinationElement = destinationRow;
2128 
2129 				for(int x = 0; x < width; x++)
2130 				{
2131 					unsigned int xrgb = *(unsigned short*)sourceElement;
2132 
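					// Multiplying a 4-bit field by 0x11 replicates the nibble into 8 bits
					// (e.g. 0xF -> 0xFF); the extra shift folded into each constant moves the
					// result into its destination byte lane before masking.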
2133 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2134 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2135 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2136 
2137 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2138 
2139 					sourceElement += source.bytes;
2140 					destinationElement += destination.bytes;
2141 				}
2142 
2143 				sourceRow += source.pitchB;
2144 				destinationRow += destination.pitchB;
2145 			}
2146 
2147 			sourceSlice += source.sliceB;
2148 			destinationSlice += destination.sliceB;
2149 		}
2150 
2151 		source.unlockRect();
2152 		destination.unlockRect();
2153 	}
2154 
2155 	void Surface::decodeA4R4G4B4(Buffer &destination, Buffer &source)
2156 	{
2157 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2158 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2159 
2160 		int depth = min(destination.depth, source.depth);
2161 		int height = min(destination.height, source.height);
2162 		int width = min(destination.width, source.width);
2163 
2164 		for(int z = 0; z < depth; z++)
2165 		{
2166 			unsigned char *sourceRow = sourceSlice;
2167 			unsigned char *destinationRow = destinationSlice;
2168 
2169 			for(int y = 0; y < height; y++)
2170 			{
2171 				unsigned char *sourceElement = sourceRow;
2172 				unsigned char *destinationElement = destinationRow;
2173 
2174 				for(int x = 0; x < width; x++)
2175 				{
2176 					unsigned int argb = *(unsigned short*)sourceElement;
2177 
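					// Same nibble-replication trick as decodeX4R4G4B4, with 0x00011000
					// additionally placing the replicated alpha in bits 24-31.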
2178 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2179 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2180 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2181 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2182 
2183 					*(unsigned int*)destinationElement = a | r | g | b;
2184 
2185 					sourceElement += source.bytes;
2186 					destinationElement += destination.bytes;
2187 				}
2188 
2189 				sourceRow += source.pitchB;
2190 				destinationRow += destination.pitchB;
2191 			}
2192 
2193 			sourceSlice += source.sliceB;
2194 			destinationSlice += destination.sliceB;
2195 		}
2196 
2197 		source.unlockRect();
2198 		destination.unlockRect();
2199 	}
2200 
2201 	void Surface::decodeP8(Buffer &destination, Buffer &source)
2202 	{
2203 		unsigned char *sourceSlice = (unsigned char*)source.lockRect(0, 0, 0, sw::LOCK_READONLY);
2204 		unsigned char *destinationSlice = (unsigned char*)destination.lockRect(0, 0, 0, sw::LOCK_UPDATE);
2205 
2206 		int depth = min(destination.depth, source.depth);
2207 		int height = min(destination.height, source.height);
2208 		int width = min(destination.width, source.width);
2209 
2210 		for(int z = 0; z < depth; z++)
2211 		{
2212 			unsigned char *sourceRow = sourceSlice;
2213 			unsigned char *destinationRow = destinationSlice;
2214 
2215 			for(int y = 0; y < height; y++)
2216 			{
2217 				unsigned char *sourceElement = sourceRow;
2218 				unsigned char *destinationElement = destinationRow;
2219 
2220 				for(int x = 0; x < width; x++)
2221 				{
2222 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2223 
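					// The palette entry is read as A8B8G8R8 (hence the abgr name); the red
					// and blue lanes are swapped to produce the A8R8G8B8 value written out.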
2224 					unsigned int r = (abgr & 0x000000FF) << 16;
2225 					unsigned int g = (abgr & 0x0000FF00) << 0;
2226 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2227 					unsigned int a = (abgr & 0xFF000000) >> 0;
2228 
2229 					*(unsigned int*)destinationElement = a | r | g | b;
2230 
2231 					sourceElement += source.bytes;
2232 					destinationElement += destination.bytes;
2233 				}
2234 
2235 				sourceRow += source.pitchB;
2236 				destinationRow += destination.pitchB;
2237 			}
2238 
2239 			sourceSlice += source.sliceB;
2240 			destinationSlice += destination.sliceB;
2241 		}
2242 
2243 		source.unlockRect();
2244 		destination.unlockRect();
2245 	}
2246 
2247 	void Surface::decodeDXT1(Buffer &internal, Buffer &external)
2248 	{
2249 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2250 		const DXT1 *source = (const DXT1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2251 
2252 		for(int z = 0; z < external.depth; z++)
2253 		{
2254 			unsigned int *dest = destSlice;
2255 
2256 			for(int y = 0; y < external.height; y += 4)
2257 			{
2258 				for(int x = 0; x < external.width; x += 4)
2259 				{
2260 					Color<byte> c[4];
2261 
2262 					c[0] = source->c0;
2263 					c[1] = source->c1;
2264 
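					// DXT1 selects its block mode by comparing the encoded endpoint values:
					// c0 > c1 yields four opaque colors, otherwise the block uses three
					// colors plus a transparent black c3.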
2265 					if(source->c0 > source->c1)   // No transparency
2266 					{
2267 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2268 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2269 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2270 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2271 						c[2].a = 0xFF;
2272 
2273 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2274 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2275 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2276 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2277 						c[3].a = 0xFF;
2278 					}
2279 					else   // c3 transparent
2280 					{
2281 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2282 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2283 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2284 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2285 						c[2].a = 0xFF;
2286 
2287 						c[3].r = 0;
2288 						c[3].g = 0;
2289 						c[3].b = 0;
2290 						c[3].a = 0;
2291 					}
2292 
2293 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2294 					{
2295 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2296 						{
2297 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2298 						}
2299 					}
2300 
2301 					source++;
2302 				}
2303 			}
2304 
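			// Advance the typed slice pointer by a byte count by reinterpreting it as a byte pointer reference.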
2305 			(byte*&)destSlice += internal.sliceB;
2306 		}
2307 
2308 		external.unlockRect();
2309 		internal.unlockRect();
2310 	}
2311 
2312 	void Surface::decodeDXT3(Buffer &internal, Buffer &external)
2313 	{
2314 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2315 		const DXT3 *source = (const DXT3*)external.lockRect(0, 0, 0, LOCK_READONLY);
2316 
2317 		for(int z = 0; z < external.depth; z++)
2318 		{
2319 			unsigned int *dest = destSlice;
2320 
2321 			for(int y = 0; y < external.height; y += 4)
2322 			{
2323 				for(int x = 0; x < external.width; x += 4)
2324 				{
2325 					Color<byte> c[4];
2326 
2327 					c[0] = source->c0;
2328 					c[1] = source->c1;
2329 
2330 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2331 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2332 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2333 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2334 
2335 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2336 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2337 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2338 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2339 
2340 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2341 					{
2342 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2343 						{
2344 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2345 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2346 
2347 							dest[(x + i) + (y + j) * internal.width] = color;
2348 						}
2349 					}
2350 
2351 					source++;
2352 				}
2353 			}
2354 
2355 			(byte*&)destSlice += internal.sliceB;
2356 		}
2357 
2358 		external.unlockRect();
2359 		internal.unlockRect();
2360 	}
2361 
2362 	void Surface::decodeDXT5(Buffer &internal, Buffer &external)
2363 	{
2364 		unsigned int *destSlice = (unsigned int*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2365 		const DXT5 *source = (const DXT5*)external.lockRect(0, 0, 0, LOCK_READONLY);
2366 
2367 		for(int z = 0; z < external.depth; z++)
2368 		{
2369 			unsigned int *dest = destSlice;
2370 
2371 			for(int y = 0; y < external.height; y += 4)
2372 			{
2373 				for(int x = 0; x < external.width; x += 4)
2374 				{
2375 					Color<byte> c[4];
2376 
2377 					c[0] = source->c0;
2378 					c[1] = source->c1;
2379 
2380 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2381 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2382 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2383 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2384 
2385 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2386 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2387 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2388 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2389 
2390 					byte a[8];
2391 
2392 					a[0] = source->a0;
2393 					a[1] = source->a1;
2394 
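					// The alpha endpoints select the interpolation mode: a0 > a1 produces six
					// interpolated values, otherwise four interpolated values plus explicit 0 and 0xFF.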
2395 					if(a[0] > a[1])
2396 					{
2397 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2398 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2399 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2400 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2401 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2402 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2403 					}
2404 					else
2405 					{
2406 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2407 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2408 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2409 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2410 						a[6] = 0;
2411 						a[7] = 0xFF;
2412 					}
2413 
2414 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2415 					{
2416 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2417 						{
2418 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2419 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2420 
2421 							dest[(x + i) + (y + j) * internal.width] = color;
2422 						}
2423 					}
2424 
2425 					source++;
2426 				}
2427 			}
2428 
2429 			(byte*&)destSlice += internal.sliceB;
2430 		}
2431 
2432 		external.unlockRect();
2433 		internal.unlockRect();
2434 	}
2435 
2436 	void Surface::decodeATI1(Buffer &internal, Buffer &external)
2437 	{
2438 		byte *destSlice = (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2439 		const ATI1 *source = (const ATI1*)external.lockRect(0, 0, 0, LOCK_READONLY);
2440 
2441 		for(int z = 0; z < external.depth; z++)
2442 		{
2443 			byte *dest = destSlice;
2444 
2445 			for(int y = 0; y < external.height; y += 4)
2446 			{
2447 				for(int x = 0; x < external.width; x += 4)
2448 				{
2449 					byte r[8];
2450 
2451 					r[0] = source->r0;
2452 					r[1] = source->r1;
2453 
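					// ATI1 (BC4) reuses the DXT5 alpha interpolation scheme for its single red channel.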
2454 					if(r[0] > r[1])
2455 					{
2456 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2457 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2458 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2459 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2460 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2461 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2462 					}
2463 					else
2464 					{
2465 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2466 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2467 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2468 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2469 						r[6] = 0;
2470 						r[7] = 0xFF;
2471 					}
2472 
2473 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2474 					{
2475 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2476 						{
2477 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2478 						}
2479 					}
2480 
2481 					source++;
2482 				}
2483 			}
2484 
2485 			destSlice += internal.sliceB;
2486 		}
2487 
2488 		external.unlockRect();
2489 		internal.unlockRect();
2490 	}
2491 
2492 	void Surface::decodeATI2(Buffer &internal, Buffer &external)
2493 	{
2494 		word *destSlice = (word*)internal.lockRect(0, 0, 0, LOCK_UPDATE);
2495 		const ATI2 *source = (const ATI2*)external.lockRect(0, 0, 0, LOCK_READONLY);
2496 
2497 		for(int z = 0; z < external.depth; z++)
2498 		{
2499 			word *dest = destSlice;
2500 
2501 			for(int y = 0; y < external.height; y += 4)
2502 			{
2503 				for(int x = 0; x < external.width; x += 4)
2504 				{
2505 					byte X[8];
2506 
2507 					X[0] = source->x0;
2508 					X[1] = source->x1;
2509 
2510 					if(X[0] > X[1])
2511 					{
2512 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2513 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2514 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2515 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2516 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2517 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2518 					}
2519 					else
2520 					{
2521 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2522 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2523 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2524 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2525 						X[6] = 0;
2526 						X[7] = 0xFF;
2527 					}
2528 
2529 					byte Y[8];
2530 
2531 					Y[0] = source->y0;
2532 					Y[1] = source->y1;
2533 
2534 					if(Y[0] > Y[1])
2535 					{
2536 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2537 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2538 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2539 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2540 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2541 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2542 					}
2543 					else
2544 					{
2545 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2546 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2547 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2548 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2549 						Y[6] = 0;
2550 						Y[7] = 0xFF;
2551 					}
2552 
2553 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2554 					{
2555 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2556 						{
2557 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2558 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2559 
2560 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2561 						}
2562 					}
2563 
2564 					source++;
2565 				}
2566 			}
2567 
2568 			(byte*&)destSlice += internal.sliceB;
2569 		}
2570 
2571 		external.unlockRect();
2572 		internal.unlockRect();
2573 	}
2574 
2575 	void Surface::decodeETC2(Buffer &internal, Buffer &external, int nbAlphaBits, bool isSRGB)
2576 	{
2577 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), (byte*)internal.lockRect(0, 0, 0, LOCK_UPDATE), external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2578 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2579 		external.unlockRect();
2580 		internal.unlockRect();
2581 
2582 		if(isSRGB)
2583 		{
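			// Lazily build a 256-entry sRGB-to-linear lookup table the first time an sRGB
			// ETC2 surface is decoded, then convert the decoded RGB channels in place below.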
2584 			static byte sRGBtoLinearTable[256];
2585 			static bool sRGBtoLinearTableDirty = true;
2586 			if(sRGBtoLinearTableDirty)
2587 			{
2588 				for(int i = 0; i < 256; i++)
2589 				{
2590 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2591 				}
2592 				sRGBtoLinearTableDirty = false;
2593 			}
2594 
2595 			// Perform sRGB conversion in place after decoding
2596 			byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2597 			for(int y = 0; y < internal.height; y++)
2598 			{
2599 				byte *srcRow = src + y * internal.pitchB;
2600 				for(int x = 0; x < internal.width; x++)
2601 				{
2602 					byte *srcPix = srcRow + x * internal.bytes;
2603 					for(int i = 0; i < 3; i++)
2604 					{
2605 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2606 					}
2607 				}
2608 			}
2609 			internal.unlockRect();
2610 		}
2611 	}
2612 
2613 	void Surface::decodeEAC(Buffer &internal, Buffer &external, int nbChannels, bool isSigned)
2614 	{
2615 		ASSERT(nbChannels == 1 || nbChannels == 2);
2616 
2617 		byte *src = (byte*)internal.lockRect(0, 0, 0, LOCK_READWRITE);
2618 		ETC_Decoder::Decode((const byte*)external.lockRect(0, 0, 0, LOCK_READONLY), src, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2619 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2620 		external.unlockRect();
2621 
2622 		// FIXME: We convert EAC data to float, until signed short internal formats are supported
2623 		//        This code can be removed if ETC2 images are decoded to internal 16 bit signed R/RG formats
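		// 8 * 127.875 == 1023 and 8 * 255.875 == 2047, the extremes of the decoded signed
		// and unsigned 11-bit EAC values, so the result maps onto [-1, 1] or [0, 1].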
2624 		const float normalization = isSigned ? (1.0f / (8.0f * 127.875f)) : (1.0f / (8.0f * 255.875f));
2625 		for(int y = 0; y < internal.height; y++)
2626 		{
2627 			byte* srcRow = src + y * internal.pitchB;
2628 			for(int x = internal.width - 1; x >= 0; x--)
2629 			{
2630 				int* srcPix = reinterpret_cast<int*>(srcRow + x * internal.bytes);
2631 				float* dstPix = reinterpret_cast<float*>(srcPix);
2632 				for(int c = nbChannels - 1; c >= 0; c--)
2633 				{
2634 					dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2635 				}
2636 			}
2637 		}
2638 
2639 		internal.unlockRect();
2640 	}
2641 
2642 	void Surface::decodeASTC(Buffer &internal, Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2643 	{
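		// ASTC decoding is not implemented; see the FIXME for the ASTC entries in bytes().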
2644 	}
2645 
2646 	unsigned int Surface::size(int width, int height, int depth, int border, int samples, Format format)
2647 	{
2648 		width += 2 * border;
2649 		height += 2 * border;
2650 
2651 		// Dimensions rounded up to multiples of 4, used for compressed formats
2652 		int width4 = align(width, 4);
2653 		int height4 = align(height, 4);
2654 
2655 		switch(format)
2656 		{
2657 		case FORMAT_DXT1:
2658 		case FORMAT_ATI1:
2659 		case FORMAT_ETC1:
2660 		case FORMAT_R11_EAC:
2661 		case FORMAT_SIGNED_R11_EAC:
2662 		case FORMAT_RGB8_ETC2:
2663 		case FORMAT_SRGB8_ETC2:
2664 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2665 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2666 			return width4 * height4 * depth / 2;
2667 		case FORMAT_DXT3:
2668 		case FORMAT_DXT5:
2669 		case FORMAT_ATI2:
2670 		case FORMAT_RG11_EAC:
2671 		case FORMAT_SIGNED_RG11_EAC:
2672 		case FORMAT_RGBA8_ETC2_EAC:
2673 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2674 		case FORMAT_RGBA_ASTC_4x4_KHR:
2675 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2676 			return width4 * height4 * depth;
2677 		case FORMAT_RGBA_ASTC_5x4_KHR:
2678 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2679 			return align(width, 5) * height4 * depth;
2680 		case FORMAT_RGBA_ASTC_5x5_KHR:
2681 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2682 			return align(width, 5) * align(height, 5) * depth;
2683 		case FORMAT_RGBA_ASTC_6x5_KHR:
2684 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2685 			return align(width, 6) * align(height, 5) * depth;
2686 		case FORMAT_RGBA_ASTC_6x6_KHR:
2687 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2688 			return align(width, 6) * align(height, 6) * depth;
2689 		case FORMAT_RGBA_ASTC_8x5_KHR:
2690 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2691 			return align(width, 8) * align(height, 5) * depth;
2692 		case FORMAT_RGBA_ASTC_8x6_KHR:
2693 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2694 			return align(width, 8) * align(height, 6) * depth;
2695 		case FORMAT_RGBA_ASTC_8x8_KHR:
2696 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2697 			return align(width, 8) * align(height, 8) * depth;
2698 		case FORMAT_RGBA_ASTC_10x5_KHR:
2699 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2700 			return align(width, 10) * align(height, 5) * depth;
2701 		case FORMAT_RGBA_ASTC_10x6_KHR:
2702 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2703 			return align(width, 10) * align(height, 6) * depth;
2704 		case FORMAT_RGBA_ASTC_10x8_KHR:
2705 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2706 			return align(width, 10) * align(height, 8) * depth;
2707 		case FORMAT_RGBA_ASTC_10x10_KHR:
2708 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2709 			return align(width, 10) * align(height, 10) * depth;
2710 		case FORMAT_RGBA_ASTC_12x10_KHR:
2711 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2712 			return align(width, 12) * align(height, 10) * depth;
2713 		case FORMAT_RGBA_ASTC_12x12_KHR:
2714 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2715 			return align(width, 12) * align(height, 12) * depth;
2716 		case FORMAT_YV12_BT601:
2717 		case FORMAT_YV12_BT709:
2718 		case FORMAT_YV12_JFIF:
2719 			{
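				// YV12: a full-resolution Y plane followed by two half-width, half-height
				// chroma planes, each with a 16-byte aligned stride.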
2720 				unsigned int YStride = align(width, 16);
2721 				unsigned int YSize = YStride * height;
2722 				unsigned int CStride = align(YStride / 2, 16);
2723 				unsigned int CSize = CStride * height / 2;
2724 
2725 				return YSize + 2 * CSize;
2726 			}
2727 		default:
2728 			return bytes(format) * width * height * depth * samples;
2729 		}
2730 	}
2731 
2732 	bool Surface::isStencil(Format format)
2733 	{
2734 		switch(format)
2735 		{
2736 		case FORMAT_D32:
2737 		case FORMAT_D16:
2738 		case FORMAT_D24X8:
2739 		case FORMAT_D32F:
2740 		case FORMAT_D32F_COMPLEMENTARY:
2741 		case FORMAT_D32F_LOCKABLE:
2742 		case FORMAT_D32F_SHADOW:
2743 			return false;
2744 		case FORMAT_D24S8:
2745 		case FORMAT_D24FS8:
2746 		case FORMAT_S8:
2747 		case FORMAT_DF24S8:
2748 		case FORMAT_DF16S8:
2749 		case FORMAT_D32FS8_TEXTURE:
2750 		case FORMAT_D32FS8_SHADOW:
2751 		case FORMAT_D32FS8:
2752 		case FORMAT_D32FS8_COMPLEMENTARY:
2753 		case FORMAT_INTZ:
2754 			return true;
2755 		default:
2756 			return false;
2757 		}
2758 	}
2759 
2760 	bool Surface::isDepth(Format format)
2761 	{
2762 		switch(format)
2763 		{
2764 		case FORMAT_D32:
2765 		case FORMAT_D16:
2766 		case FORMAT_D24X8:
2767 		case FORMAT_D24S8:
2768 		case FORMAT_D24FS8:
2769 		case FORMAT_D32F:
2770 		case FORMAT_D32FS8:
2771 		case FORMAT_D32F_COMPLEMENTARY:
2772 		case FORMAT_D32FS8_COMPLEMENTARY:
2773 		case FORMAT_D32F_LOCKABLE:
2774 		case FORMAT_DF24S8:
2775 		case FORMAT_DF16S8:
2776 		case FORMAT_D32FS8_TEXTURE:
2777 		case FORMAT_D32F_SHADOW:
2778 		case FORMAT_D32FS8_SHADOW:
2779 		case FORMAT_INTZ:
2780 			return true;
2781 		case FORMAT_S8:
2782 			return false;
2783 		default:
2784 			return false;
2785 		}
2786 	}
2787 
2788 	bool Surface::hasQuadLayout(Format format)
2789 	{
2790 		switch(format)
2791 		{
2792 		case FORMAT_D32:
2793 		case FORMAT_D16:
2794 		case FORMAT_D24X8:
2795 		case FORMAT_D24S8:
2796 		case FORMAT_D24FS8:
2797 		case FORMAT_D32F:
2798 		case FORMAT_D32FS8:
2799 		case FORMAT_D32F_COMPLEMENTARY:
2800 		case FORMAT_D32FS8_COMPLEMENTARY:
2801 		case FORMAT_DF24S8:
2802 		case FORMAT_DF16S8:
2803 		case FORMAT_INTZ:
2804 		case FORMAT_S8:
2805 		case FORMAT_A8G8R8B8Q:
2806 		case FORMAT_X8G8R8B8Q:
2807 			return true;
2808 		case FORMAT_D32F_LOCKABLE:
2809 		case FORMAT_D32FS8_TEXTURE:
2810 		case FORMAT_D32F_SHADOW:
2811 		case FORMAT_D32FS8_SHADOW:
2812 		default:
2813 			break;
2814 		}
2815 
2816 		return false;
2817 	}
2818 
2819 	bool Surface::isPalette(Format format)
2820 	{
2821 		switch(format)
2822 		{
2823 		case FORMAT_P8:
2824 		case FORMAT_A8P8:
2825 			return true;
2826 		default:
2827 			return false;
2828 		}
2829 	}
2830 
2831 	bool Surface::isFloatFormat(Format format)
2832 	{
2833 		switch(format)
2834 		{
2835 		case FORMAT_R5G6B5:
2836 		case FORMAT_R8G8B8:
2837 		case FORMAT_B8G8R8:
2838 		case FORMAT_X8R8G8B8:
2839 		case FORMAT_X8B8G8R8I:
2840 		case FORMAT_X8B8G8R8:
2841 		case FORMAT_A8R8G8B8:
2842 		case FORMAT_SRGB8_X8:
2843 		case FORMAT_SRGB8_A8:
2844 		case FORMAT_A8B8G8R8I:
2845 		case FORMAT_R8UI:
2846 		case FORMAT_G8R8UI:
2847 		case FORMAT_X8B8G8R8UI:
2848 		case FORMAT_A8B8G8R8UI:
2849 		case FORMAT_A8B8G8R8:
2850 		case FORMAT_G8R8I:
2851 		case FORMAT_G8R8:
2852 		case FORMAT_A2B10G10R10:
2853 		case FORMAT_A2B10G10R10UI:
2854 		case FORMAT_R8_SNORM:
2855 		case FORMAT_G8R8_SNORM:
2856 		case FORMAT_X8B8G8R8_SNORM:
2857 		case FORMAT_A8B8G8R8_SNORM:
2858 		case FORMAT_R16I:
2859 		case FORMAT_R16UI:
2860 		case FORMAT_G16R16I:
2861 		case FORMAT_G16R16UI:
2862 		case FORMAT_G16R16:
2863 		case FORMAT_X16B16G16R16I:
2864 		case FORMAT_X16B16G16R16UI:
2865 		case FORMAT_A16B16G16R16I:
2866 		case FORMAT_A16B16G16R16UI:
2867 		case FORMAT_A16B16G16R16:
2868 		case FORMAT_V8U8:
2869 		case FORMAT_Q8W8V8U8:
2870 		case FORMAT_X8L8V8U8:
2871 		case FORMAT_V16U16:
2872 		case FORMAT_A16W16V16U16:
2873 		case FORMAT_Q16W16V16U16:
2874 		case FORMAT_A8:
2875 		case FORMAT_R8I:
2876 		case FORMAT_R8:
2877 		case FORMAT_S8:
2878 		case FORMAT_L8:
2879 		case FORMAT_L16:
2880 		case FORMAT_A8L8:
2881 		case FORMAT_YV12_BT601:
2882 		case FORMAT_YV12_BT709:
2883 		case FORMAT_YV12_JFIF:
2884 		case FORMAT_R32I:
2885 		case FORMAT_R32UI:
2886 		case FORMAT_G32R32I:
2887 		case FORMAT_G32R32UI:
2888 		case FORMAT_X32B32G32R32I:
2889 		case FORMAT_X32B32G32R32UI:
2890 		case FORMAT_A32B32G32R32I:
2891 		case FORMAT_A32B32G32R32UI:
2892 			return false;
2893 		case FORMAT_R16F:
2894 		case FORMAT_G16R16F:
2895 		case FORMAT_B16G16R16F:
2896 		case FORMAT_X16B16G16R16F:
2897 		case FORMAT_A16B16G16R16F:
2898 		case FORMAT_X16B16G16R16F_UNSIGNED:
2899 		case FORMAT_R32F:
2900 		case FORMAT_G32R32F:
2901 		case FORMAT_B32G32R32F:
2902 		case FORMAT_X32B32G32R32F:
2903 		case FORMAT_A32B32G32R32F:
2904 		case FORMAT_X32B32G32R32F_UNSIGNED:
2905 		case FORMAT_D32F:
2906 		case FORMAT_D32FS8:
2907 		case FORMAT_D32F_COMPLEMENTARY:
2908 		case FORMAT_D32FS8_COMPLEMENTARY:
2909 		case FORMAT_D32F_LOCKABLE:
2910 		case FORMAT_D32FS8_TEXTURE:
2911 		case FORMAT_D32F_SHADOW:
2912 		case FORMAT_D32FS8_SHADOW:
2913 		case FORMAT_L16F:
2914 		case FORMAT_A16L16F:
2915 		case FORMAT_L32F:
2916 		case FORMAT_A32L32F:
2917 			return true;
2918 		default:
2919 			ASSERT(false);
2920 		}
2921 
2922 		return false;
2923 	}
2924 
2925 	bool Surface::isUnsignedComponent(Format format, int component)
2926 	{
2927 		switch(format)
2928 		{
2929 		case FORMAT_NULL:
2930 		case FORMAT_R5G6B5:
2931 		case FORMAT_R8G8B8:
2932 		case FORMAT_B8G8R8:
2933 		case FORMAT_X8R8G8B8:
2934 		case FORMAT_X8B8G8R8:
2935 		case FORMAT_A8R8G8B8:
2936 		case FORMAT_A8B8G8R8:
2937 		case FORMAT_SRGB8_X8:
2938 		case FORMAT_SRGB8_A8:
2939 		case FORMAT_G8R8:
2940 		case FORMAT_A2B10G10R10:
2941 		case FORMAT_A2B10G10R10UI:
2942 		case FORMAT_R16UI:
2943 		case FORMAT_G16R16:
2944 		case FORMAT_G16R16UI:
2945 		case FORMAT_X16B16G16R16UI:
2946 		case FORMAT_A16B16G16R16:
2947 		case FORMAT_A16B16G16R16UI:
2948 		case FORMAT_R32UI:
2949 		case FORMAT_G32R32UI:
2950 		case FORMAT_X32B32G32R32UI:
2951 		case FORMAT_A32B32G32R32UI:
2952 		case FORMAT_X32B32G32R32F_UNSIGNED:
2953 		case FORMAT_R8UI:
2954 		case FORMAT_G8R8UI:
2955 		case FORMAT_X8B8G8R8UI:
2956 		case FORMAT_A8B8G8R8UI:
2957 		case FORMAT_D32F:
2958 		case FORMAT_D32FS8:
2959 		case FORMAT_D32F_COMPLEMENTARY:
2960 		case FORMAT_D32FS8_COMPLEMENTARY:
2961 		case FORMAT_D32F_LOCKABLE:
2962 		case FORMAT_D32FS8_TEXTURE:
2963 		case FORMAT_D32F_SHADOW:
2964 		case FORMAT_D32FS8_SHADOW:
2965 		case FORMAT_A8:
2966 		case FORMAT_R8:
2967 		case FORMAT_L8:
2968 		case FORMAT_L16:
2969 		case FORMAT_A8L8:
2970 		case FORMAT_YV12_BT601:
2971 		case FORMAT_YV12_BT709:
2972 		case FORMAT_YV12_JFIF:
2973 			return true;
2974 		case FORMAT_A8B8G8R8I:
2975 		case FORMAT_A16B16G16R16I:
2976 		case FORMAT_A32B32G32R32I:
2977 		case FORMAT_A8B8G8R8_SNORM:
2978 		case FORMAT_Q8W8V8U8:
2979 		case FORMAT_Q16W16V16U16:
2980 		case FORMAT_A32B32G32R32F:
2981 			return false;
2982 		case FORMAT_R32F:
2983 		case FORMAT_R8I:
2984 		case FORMAT_R16I:
2985 		case FORMAT_R32I:
2986 		case FORMAT_R8_SNORM:
2987 			return component >= 1;
2988 		case FORMAT_V8U8:
2989 		case FORMAT_X8L8V8U8:
2990 		case FORMAT_V16U16:
2991 		case FORMAT_G32R32F:
2992 		case FORMAT_G8R8I:
2993 		case FORMAT_G16R16I:
2994 		case FORMAT_G32R32I:
2995 		case FORMAT_G8R8_SNORM:
2996 			return component >= 2;
2997 		case FORMAT_A16W16V16U16:
2998 		case FORMAT_B32G32R32F:
2999 		case FORMAT_X32B32G32R32F:
3000 		case FORMAT_X8B8G8R8I:
3001 		case FORMAT_X16B16G16R16I:
3002 		case FORMAT_X32B32G32R32I:
3003 		case FORMAT_X8B8G8R8_SNORM:
3004 			return component >= 3;
3005 		default:
3006 			ASSERT(false);
3007 		}
3008 
3009 		return false;
3010 	}
3011 
3012 	bool Surface::isSRGBreadable(Format format)
3013 	{
3014 		// Keep in sync with Capabilities::isSRGBreadable
3015 		switch(format)
3016 		{
3017 		case FORMAT_L8:
3018 		case FORMAT_A8L8:
3019 		case FORMAT_R8G8B8:
3020 		case FORMAT_A8R8G8B8:
3021 		case FORMAT_X8R8G8B8:
3022 		case FORMAT_A8B8G8R8:
3023 		case FORMAT_X8B8G8R8:
3024 		case FORMAT_SRGB8_X8:
3025 		case FORMAT_SRGB8_A8:
3026 		case FORMAT_R5G6B5:
3027 		case FORMAT_X1R5G5B5:
3028 		case FORMAT_A1R5G5B5:
3029 		case FORMAT_A4R4G4B4:
3030 		case FORMAT_DXT1:
3031 		case FORMAT_DXT3:
3032 		case FORMAT_DXT5:
3033 		case FORMAT_ATI1:
3034 		case FORMAT_ATI2:
3035 			return true;
3036 		default:
3037 			return false;
3038 		}
3039 	}
3040 
3041 	bool Surface::isSRGBwritable(Format format)
3042 	{
3043 		// Keep in sync with Capabilities::isSRGBwritable
3044 		switch(format)
3045 		{
3046 		case FORMAT_NULL:
3047 		case FORMAT_A8R8G8B8:
3048 		case FORMAT_X8R8G8B8:
3049 		case FORMAT_A8B8G8R8:
3050 		case FORMAT_X8B8G8R8:
3051 		case FORMAT_SRGB8_X8:
3052 		case FORMAT_SRGB8_A8:
3053 		case FORMAT_R5G6B5:
3054 			return true;
3055 		default:
3056 			return false;
3057 		}
3058 	}
3059 
3060 	bool Surface::isSRGBformat(Format format)
3061 	{
3062 		switch(format)
3063 		{
3064 		case FORMAT_SRGB8_X8:
3065 		case FORMAT_SRGB8_A8:
3066 			return true;
3067 		default:
3068 			return false;
3069 		}
3070 	}
3071 
3072 	bool Surface::isCompressed(Format format)
3073 	{
3074 		switch(format)
3075 		{
3076 		case FORMAT_DXT1:
3077 		case FORMAT_DXT3:
3078 		case FORMAT_DXT5:
3079 		case FORMAT_ATI1:
3080 		case FORMAT_ATI2:
3081 		case FORMAT_ETC1:
3082 		case FORMAT_R11_EAC:
3083 		case FORMAT_SIGNED_R11_EAC:
3084 		case FORMAT_RG11_EAC:
3085 		case FORMAT_SIGNED_RG11_EAC:
3086 		case FORMAT_RGB8_ETC2:
3087 		case FORMAT_SRGB8_ETC2:
3088 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3089 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3090 		case FORMAT_RGBA8_ETC2_EAC:
3091 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3092 		case FORMAT_RGBA_ASTC_4x4_KHR:
3093 		case FORMAT_RGBA_ASTC_5x4_KHR:
3094 		case FORMAT_RGBA_ASTC_5x5_KHR:
3095 		case FORMAT_RGBA_ASTC_6x5_KHR:
3096 		case FORMAT_RGBA_ASTC_6x6_KHR:
3097 		case FORMAT_RGBA_ASTC_8x5_KHR:
3098 		case FORMAT_RGBA_ASTC_8x6_KHR:
3099 		case FORMAT_RGBA_ASTC_8x8_KHR:
3100 		case FORMAT_RGBA_ASTC_10x5_KHR:
3101 		case FORMAT_RGBA_ASTC_10x6_KHR:
3102 		case FORMAT_RGBA_ASTC_10x8_KHR:
3103 		case FORMAT_RGBA_ASTC_10x10_KHR:
3104 		case FORMAT_RGBA_ASTC_12x10_KHR:
3105 		case FORMAT_RGBA_ASTC_12x12_KHR:
3106 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3107 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3108 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3109 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3110 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3111 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3112 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3113 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3114 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3115 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3116 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3117 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3118 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3119 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3120 			return true;
3121 		default:
3122 			return false;
3123 		}
3124 	}
3125 
3126 	bool Surface::isSignedNonNormalizedInteger(Format format)
3127 	{
3128 		switch(format)
3129 		{
3130 		case FORMAT_A8B8G8R8I:
3131 		case FORMAT_X8B8G8R8I:
3132 		case FORMAT_G8R8I:
3133 		case FORMAT_R8I:
3134 		case FORMAT_A16B16G16R16I:
3135 		case FORMAT_X16B16G16R16I:
3136 		case FORMAT_G16R16I:
3137 		case FORMAT_R16I:
3138 		case FORMAT_A32B32G32R32I:
3139 		case FORMAT_X32B32G32R32I:
3140 		case FORMAT_G32R32I:
3141 		case FORMAT_R32I:
3142 			return true;
3143 		default:
3144 			return false;
3145 		}
3146 	}
3147 
3148 	bool Surface::isUnsignedNonNormalizedInteger(Format format)
3149 	{
3150 		switch(format)
3151 		{
3152 		case FORMAT_A8B8G8R8UI:
3153 		case FORMAT_X8B8G8R8UI:
3154 		case FORMAT_G8R8UI:
3155 		case FORMAT_R8UI:
3156 		case FORMAT_A16B16G16R16UI:
3157 		case FORMAT_X16B16G16R16UI:
3158 		case FORMAT_G16R16UI:
3159 		case FORMAT_R16UI:
3160 		case FORMAT_A32B32G32R32UI:
3161 		case FORMAT_X32B32G32R32UI:
3162 		case FORMAT_G32R32UI:
3163 		case FORMAT_R32UI:
3164 			return true;
3165 		default:
3166 			return false;
3167 		}
3168 	}
3169 
3170 	bool Surface::isNonNormalizedInteger(Format format)
3171 	{
3172 		return isSignedNonNormalizedInteger(format) ||
3173 		       isUnsignedNonNormalizedInteger(format);
3174 	}
3175 
3176 	bool Surface::isNormalizedInteger(Format format)
3177 	{
3178 		return !isFloatFormat(format) &&
3179 		       !isNonNormalizedInteger(format) &&
3180 		       !isCompressed(format) &&
3181 		       !isDepth(format) &&
3182 		       !isStencil(format);
3183 	}
3184 
3185 	int Surface::componentCount(Format format)
3186 	{
3187 		switch(format)
3188 		{
3189 		case FORMAT_R5G6B5:         return 3;
3190 		case FORMAT_X8R8G8B8:       return 3;
3191 		case FORMAT_X8B8G8R8I:      return 3;
3192 		case FORMAT_X8B8G8R8:       return 3;
3193 		case FORMAT_A8R8G8B8:       return 4;
3194 		case FORMAT_SRGB8_X8:       return 3;
3195 		case FORMAT_SRGB8_A8:       return 4;
3196 		case FORMAT_A8B8G8R8I:      return 4;
3197 		case FORMAT_A8B8G8R8:       return 4;
3198 		case FORMAT_G8R8I:          return 2;
3199 		case FORMAT_G8R8:           return 2;
3200 		case FORMAT_R8_SNORM:       return 1;
3201 		case FORMAT_G8R8_SNORM:     return 2;
3202 		case FORMAT_X8B8G8R8_SNORM: return 3;
3203 		case FORMAT_A8B8G8R8_SNORM: return 4;
3204 		case FORMAT_R8UI:           return 1;
3205 		case FORMAT_G8R8UI:         return 2;
3206 		case FORMAT_X8B8G8R8UI:     return 3;
3207 		case FORMAT_A8B8G8R8UI:     return 4;
3208 		case FORMAT_A2B10G10R10:    return 4;
3209 		case FORMAT_A2B10G10R10UI:  return 4;
3210 		case FORMAT_G16R16I:        return 2;
3211 		case FORMAT_G16R16UI:       return 2;
3212 		case FORMAT_G16R16:         return 2;
3213 		case FORMAT_G32R32I:        return 2;
3214 		case FORMAT_G32R32UI:       return 2;
3215 		case FORMAT_X16B16G16R16I:  return 3;
3216 		case FORMAT_X16B16G16R16UI: return 3;
3217 		case FORMAT_A16B16G16R16I:  return 4;
3218 		case FORMAT_A16B16G16R16UI: return 4;
3219 		case FORMAT_A16B16G16R16:   return 4;
3220 		case FORMAT_X32B32G32R32I:  return 3;
3221 		case FORMAT_X32B32G32R32UI: return 3;
3222 		case FORMAT_A32B32G32R32I:  return 4;
3223 		case FORMAT_A32B32G32R32UI: return 4;
3224 		case FORMAT_V8U8:           return 2;
3225 		case FORMAT_Q8W8V8U8:       return 4;
3226 		case FORMAT_X8L8V8U8:       return 3;
3227 		case FORMAT_V16U16:         return 2;
3228 		case FORMAT_A16W16V16U16:   return 4;
3229 		case FORMAT_Q16W16V16U16:   return 4;
3230 		case FORMAT_R32F:           return 1;
3231 		case FORMAT_G32R32F:        return 2;
3232 		case FORMAT_X32B32G32R32F:  return 3;
3233 		case FORMAT_A32B32G32R32F:  return 4;
3234 		case FORMAT_X32B32G32R32F_UNSIGNED: return 3;
3235 		case FORMAT_D32F:           return 1;
3236 		case FORMAT_D32FS8:         return 1;
3237 		case FORMAT_D32F_LOCKABLE:  return 1;
3238 		case FORMAT_D32FS8_TEXTURE: return 1;
3239 		case FORMAT_D32F_SHADOW:    return 1;
3240 		case FORMAT_D32FS8_SHADOW:  return 1;
3241 		case FORMAT_A8:             return 1;
3242 		case FORMAT_R8I:            return 1;
3243 		case FORMAT_R8:             return 1;
3244 		case FORMAT_R16I:           return 1;
3245 		case FORMAT_R16UI:          return 1;
3246 		case FORMAT_R32I:           return 1;
3247 		case FORMAT_R32UI:          return 1;
3248 		case FORMAT_L8:             return 1;
3249 		case FORMAT_L16:            return 1;
3250 		case FORMAT_A8L8:           return 2;
3251 		case FORMAT_YV12_BT601:     return 3;
3252 		case FORMAT_YV12_BT709:     return 3;
3253 		case FORMAT_YV12_JFIF:      return 3;
3254 		default:
3255 			ASSERT(false);
3256 		}
3257 
3258 		return 1;
3259 	}
3260 
3261 	void *Surface::allocateBuffer(int width, int height, int depth, int border, int samples, Format format)
3262 	{
3263 		// Render targets require 2x2 quads
3264 		int width2 = (width + 1) & ~1;
3265 		int height2 = (height + 1) & ~1;
3266 
3267 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3268 		// and stencil operations also read 8 bytes per four 8-bit stencil values,
3269 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3270 		return allocate(size(width2, height2, depth, border, samples, format) + 4);
3271 	}
3272 
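	// memfill4 replicates a 32-bit pattern over 'bytes' bytes of memory. It advances
	// byte- and word-wise until the destination is 4-byte aligned, then (on x86 with
	// SSE) aligns further to 16 bytes and streams 64-byte chunks through non-temporal
	// stores, and finally writes the remaining tail with plain 4/2/1-byte stores.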
3273 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3274 	{
3275 		while((size_t)buffer & 0x1 && bytes >= 1)
3276 		{
3277 			*(char*)buffer = (char)pattern;
3278 			(char*&)buffer += 1;
3279 			bytes -= 1;
3280 		}
3281 
3282 		while((size_t)buffer & 0x3 && bytes >= 2)
3283 		{
3284 			*(short*)buffer = (short)pattern;
3285 			(short*&)buffer += 1;
3286 			bytes -= 2;
3287 		}
3288 
3289 		#if defined(__i386__) || defined(__x86_64__)
3290 			if(CPUID::supportsSSE())
3291 			{
3292 				while((size_t)buffer & 0xF && bytes >= 4)
3293 				{
3294 					*(int*)buffer = pattern;
3295 					(int*&)buffer += 1;
3296 					bytes -= 4;
3297 				}
3298 
3299 				__m128 quad = _mm_set_ps1((float&)pattern);
3300 
3301 				float *pointer = (float*)buffer;
3302 				int qxwords = bytes / 64;
3303 				bytes -= qxwords * 64;
3304 
3305 				while(qxwords--)
3306 				{
3307 					_mm_stream_ps(pointer + 0, quad);
3308 					_mm_stream_ps(pointer + 4, quad);
3309 					_mm_stream_ps(pointer + 8, quad);
3310 					_mm_stream_ps(pointer + 12, quad);
3311 
3312 					pointer += 16;
3313 				}
3314 
3315 				buffer = pointer;
3316 			}
3317 		#endif
3318 
3319 		while(bytes >= 4)
3320 		{
3321 			*(int*)buffer = (int)pattern;
3322 			(int*&)buffer += 1;
3323 			bytes -= 4;
3324 		}
3325 
3326 		while(bytes >= 2)
3327 		{
3328 			*(short*)buffer = (short)pattern;
3329 			(short*&)buffer += 1;
3330 			bytes -= 2;
3331 		}
3332 
3333 		while(bytes >= 1)
3334 		{
3335 			*(char*)buffer = (char)pattern;
3336 			(char*&)buffer += 1;
3337 			bytes -= 1;
3338 		}
3339 	}
3340 
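	// Waits for any outstanding accesses to the surface's resource to complete by
	// briefly acquiring it for exclusive use.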
3341 	void Surface::sync()
3342 	{
3343 		resource->lock(EXCLUSIVE);
3344 		resource->unlock();
3345 	}
3346 
3347 	bool Surface::isEntire(const Rect& rect) const
3348 	{
3349 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3350 	}
3351 
3352 	Rect Surface::getRect() const
3353 	{
3354 		return Rect(0, 0, internal.width, internal.height);
3355 	}
3356 
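	// Clears the depth values of the given rectangle after clipping it against the
	// surface dimensions. Linear-layout depth buffers are filled row by row with
	// memfill4; quad-layout buffers are filled per 2x2 quad, with the value
	// complemented first when a complementary depth buffer is in use.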
3357 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3358 	{
3359 		if(width == 0 || height == 0) return;
3360 
3361 		// Not overlapping
3362 		if(x0 > internal.width) return;
3363 		if(y0 > internal.height) return;
3364 		if(x0 + width < 0) return;
3365 		if(y0 + height < 0) return;
3366 
3367 		// Clip against dimensions
3368 		if(x0 < 0) {width += x0; x0 = 0;}
3369 		if(x0 + width > internal.width) width = internal.width - x0;
3370 		if(y0 < 0) {height += y0; y0 = 0;}
3371 		if(y0 + height > internal.height) height = internal.height - y0;
3372 
3373 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3374 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3375 
3376 		int x1 = x0 + width;
3377 		int y1 = y0 + height;
3378 
3379 		if(!hasQuadLayout(internal.format))
3380 		{
3381 			float *target = (float*)lockInternal(x0, y0, 0, lock, PUBLIC);
3382 
3383 			for(int z = 0; z < internal.samples; z++)
3384 			{
3385 				float *row = target;
3386 				for(int y = y0; y < y1; y++)
3387 				{
3388 					memfill4(row, (int&)depth, width * sizeof(float));
3389 					row += internal.pitchP;
3390 				}
3391 				target += internal.sliceP;
3392 			}
3393 
3394 			unlockInternal();
3395 		}
3396 		else   // Quad layout
3397 		{
3398 			if(complementaryDepthBuffer)
3399 			{
3400 				depth = 1 - depth;
3401 			}
3402 
3403 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3404 
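			// In the quad layout each 2x2 pixel quad is stored as four consecutive elements,
			// so element (x, y) lives at offset (y & ~1) * pitchP + (y & 1) * 2 + (x & ~1) * 2 + (x & 1).
			// For example, for the quad at (2, 4):
			//   (2, 4) -> 4 * pitchP + 4,   (3, 4) -> 4 * pitchP + 5,
			//   (2, 5) -> 4 * pitchP + 6,   (3, 5) -> 4 * pitchP + 7
			// oddX0/oddX1 index a partial column at an odd left/right edge, and
			// [evenX0, oddX1) is the contiguous span of whole quads filled with memfill4.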
3405 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3406 			int oddX1 = (x1 & ~1) * 2;
3407 			int evenX0 = ((x0 + 1) & ~1) * 2;
3408 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3409 
3410 			for(int z = 0; z < internal.samples; z++)
3411 			{
3412 				for(int y = y0; y < y1; y++)
3413 				{
3414 					float *target = buffer + (y & ~1) * internal.pitchP + (y & 1) * 2;
3415 
3416 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3417 					{
3418 						if((x0 & 1) != 0)
3419 						{
3420 							target[oddX0 + 0] = depth;
3421 							target[oddX0 + 2] = depth;
3422 						}
3423 
3424 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3425 					//	{
3426 					//		target[x2 + 0] = depth;
3427 					//		target[x2 + 1] = depth;
3428 					//		target[x2 + 2] = depth;
3429 					//		target[x2 + 3] = depth;
3430 					//	}
3431 
3432 					//	__asm
3433 					//	{
3434 					//		movss xmm0, depth
3435 					//		shufps xmm0, xmm0, 0x00
3436 					//
3437 					//		mov eax, x0
3438 					//		add eax, 1
3439 					//		and eax, 0xFFFFFFFE
3440 					//		cmp eax, x1
3441 					//		jge qEnd
3442 					//
3443 					//		mov edi, target
3444 					//
3445 					//	qLoop:
3446 					//		movntps [edi+8*eax], xmm0
3447 					//
3448 					//		add eax, 2
3449 					//		cmp eax, x1
3450 					//		jl qLoop
3451 					//	qEnd:
3452 					//	}
3453 
3454 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3455 
3456 						if((x1 & 1) != 0)
3457 						{
3458 							target[oddX1 + 0] = depth;
3459 							target[oddX1 + 2] = depth;
3460 						}
3461 
3462 						y++;
3463 					}
3464 					else
3465 					{
3466 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3467 						{
3468 							target[i] = depth;
3469 						}
3470 					}
3471 				}
3472 
3473 				buffer += internal.sliceP;
3474 			}
3475 
3476 			unlockInternal();
3477 		}
3478 	}
3479 
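	// Clears the stencil values of the given rectangle to 's' under 'mask'. When the
	// mask covers all bits, whole quad lines are filled with memfill4; otherwise each
	// stencil value is read, masked and rewritten individually.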
3480 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3481 	{
3482 		if(mask == 0 || width == 0 || height == 0) return;
3483 
3484 		// Not overlapping
3485 		if(x0 > internal.width) return;
3486 		if(y0 > internal.height) return;
3487 		if(x0 + width < 0) return;
3488 		if(y0 + height < 0) return;
3489 
3490 		// Clip against dimensions
3491 		if(x0 < 0) {width += x0; x0 = 0;}
3492 		if(x0 + width > internal.width) width = internal.width - x0;
3493 		if(y0 < 0) {height += y0; y0 = 0;}
3494 		if(y0 + height > internal.height) height = internal.height - y0;
3495 
3496 		int x1 = x0 + width;
3497 		int y1 = y0 + height;
3498 
3499 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3500 		int oddX1 = (x1 & ~1) * 2;
3501 		int evenX0 = ((x0 + 1) & ~1) * 2;
3502 		int evenBytes = oddX1 - evenX0;
3503 
3504 		unsigned char maskedS = s & mask;
3505 		unsigned char invMask = ~mask;
3506 		unsigned int fill = maskedS;
3507 		fill = fill | (fill << 8) | (fill << 16) | (fill << 24);
3508 
3509 		char *buffer = (char*)lockStencil(0, 0, 0, PUBLIC);
3510 
3511 		// Stencil buffers are assumed to use quad layout
3512 		for(int z = 0; z < stencil.samples; z++)
3513 		{
3514 			for(int y = y0; y < y1; y++)
3515 			{
3516 				char *target = buffer + (y & ~1) * stencil.pitchP + (y & 1) * 2;
3517 
3518 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3519 				{
3520 					if((x0 & 1) != 0)
3521 					{
3522 						target[oddX0 + 0] = fill;
3523 						target[oddX0 + 2] = fill;
3524 					}
3525 
3526 					memfill4(&target[evenX0], fill, evenBytes);
3527 
3528 					if((x1 & 1) != 0)
3529 					{
3530 						target[oddX1 + 0] = fill;
3531 						target[oddX1 + 2] = fill;
3532 					}
3533 
3534 					y++;
3535 				}
3536 				else
3537 				{
3538 					for(int x = x0; x < x1; x++)
3539 					{
3540 						int i = (x & ~1) * 2 + (x & 1);
3541 						target[i] = maskedS | (target[i] & invMask);
3542 					}
3543 				}
3544 			}
3545 
3546 			buffer += stencil.sliceP;
3547 		}
3548 
3549 		unlockStencil();
3550 	}
3551 
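	// Fills a rectangle with a constant color. The internal buffer is written when it
	// holds the most recent data (dirty), the external buffer otherwise. Formats of at
	// most 4 bytes per pixel take a fast path that replicates the pixel into a 32-bit
	// pattern for memfill4; other formats are written texel by texel.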
3552 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3553 	{
3554 		unsigned char *row;
3555 		Buffer *buffer;
3556 
3557 		if(internal.dirty)
3558 		{
3559 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3560 			buffer = &internal;
3561 		}
3562 		else
3563 		{
3564 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3565 			buffer = &external;
3566 		}
3567 
3568 		if(buffer->bytes <= 4)
3569 		{
3570 			int c = 0;   // Zero-initialized so the byte/word replication below yields a clean 32-bit pattern
3571 			buffer->write(&c, color);
3572 
3573 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3574 			if(buffer->bytes <= 2) c = (c << 16) | c;
3575 
3576 			for(int y = 0; y < height; y++)
3577 			{
3578 				memfill4(row, c, width * buffer->bytes);
3579 
3580 				row += buffer->pitchB;
3581 			}
3582 		}
3583 		else   // Generic
3584 		{
3585 			for(int y = 0; y < height; y++)
3586 			{
3587 				unsigned char *element = row;
3588 
3589 				for(int x = 0; x < width; x++)
3590 				{
3591 					buffer->write(element, color);
3592 
3593 					element += buffer->bytes;
3594 				}
3595 
3596 				row += buffer->pitchB;
3597 			}
3598 		}
3599 
3600 		if(buffer == &internal)
3601 		{
3602 			unlockInternal();
3603 		}
3604 		else
3605 		{
3606 			unlockExternal();
3607 		}
3608 	}
3609 
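	// Copies a single texel from 'source' into this surface at (x, y), using either a
	// nearest read or bilinear sampling of the source coordinates.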
3610 	void Surface::copyInternal(const Surface *source, int x, int y, float srcX, float srcY, bool filter)
3611 	{
3612 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3613 
3614 		sw::Color<float> color;
3615 
3616 		if(!filter)
3617 		{
3618 			color = source->internal.read((int)srcX, (int)srcY, 0);
3619 		}
3620 		else   // Bilinear filtering
3621 		{
3622 			color = source->internal.sample(srcX, srcY, 0);
3623 		}
3624 
3625 		internal.write(x, y, color);
3626 	}
3627 
3628 	void Surface::copyInternal(const Surface *source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3629 	{
3630 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3631 
3632 		sw::Color<float> color;
3633 
3634 		if(!filter)
3635 		{
3636 			color = source->internal.read((int)srcX, (int)srcY, (int)srcZ);
3637 		}
3638 		else   // Bilinear filtering
3639 		{
3640 			color = source->internal.sample(srcX, srcY, srcZ);
3641 		}
3642 
3643 		internal.write(x, y, z, color);
3644 	}
3645 
3646 	void Surface::copyCubeEdge(Edge dstEdge, Surface *src, Edge srcEdge)
3647 	{
3648 		Surface *dst = this;
3649 
3650 		// Figure out whether the edges need to be copied in reverse order relative to one another.
3651 		// The copy should be reversed whenever the same edges are contiguous, or when copying
3652 		// top <-> right or bottom <-> left. This follows from the cube layout, which is:
3653 		//
3654 		//      | +y |
3655 		// | -x | +z | +x | -z |
3656 		//      | -y |
3657 
3658 		bool reverse = (srcEdge == dstEdge) ||
3659 		               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
3660 		               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
3661 		               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
3662 		               ((srcEdge == LEFT) && (dstEdge == BOTTOM));
3663 
3664 		int srcBytes = src->bytes(src->Surface::getInternalFormat());
3665 		int srcPitch = src->getInternalPitchB();
3666 		int dstBytes = dst->bytes(dst->Surface::getInternalFormat());
3667 		int dstPitch = dst->getInternalPitchB();
3668 
3669 		int srcW = src->getWidth();
3670 		int srcH = src->getHeight();
3671 		int dstW = dst->getWidth();
3672 		int dstH = dst->getHeight();
3673 
3674 		ASSERT(srcW == srcH && dstW == dstH && srcW == dstW && srcBytes == dstBytes);
3675 
3676 		// Src is expressed in the regular [0, width-1], [0, height-1] space
3677 		int srcDelta = ((srcEdge == TOP) || (srcEdge == BOTTOM)) ? srcBytes : srcPitch;
3678 		int srcStart = ((srcEdge == BOTTOM) ? srcPitch * (srcH - 1) : ((srcEdge == RIGHT) ? srcBytes * (srcW - 1) : 0));
3679 
3680 		// Dst contains borders, so it is expressed in the [-1, width+1], [-1, height+1] space
3681 		int dstDelta = (((dstEdge == TOP) || (dstEdge == BOTTOM)) ? dstBytes : dstPitch) * (reverse ? -1 : 1);
3682 		int dstStart = ((dstEdge == BOTTOM) ? dstPitch * (dstH + 1) : ((dstEdge == RIGHT) ? dstBytes * (dstW + 1) : 0)) + (reverse ? dstW * -dstDelta : dstDelta);
3683 
3684 		char *srcBuf = (char*)src->lockInternal(0, 0, 0, sw::LOCK_READONLY, sw::PRIVATE) + srcStart;
3685 		char *dstBuf = (char*)dst->lockInternal(-1, -1, 0, sw::LOCK_READWRITE, sw::PRIVATE) + dstStart;
3686 
3687 		for(int i = 0; i < srcW; ++i, dstBuf += dstDelta, srcBuf += srcDelta)
3688 		{
3689 			memcpy(dstBuf, srcBuf, srcBytes);
3690 		}
3691 
3692 		if(dstEdge == LEFT || dstEdge == RIGHT)
3693 		{
3694 			// TOP and BOTTOM are already set, let's average out the corners
3695 			int x0 = (dstEdge == RIGHT) ? dstW : -1;
3696 			int y0 = -1;
3697 			int x1 = (dstEdge == RIGHT) ? dstW - 1 : 0;
3698 			int y1 = 0;
3699 			dst->computeCubeCorner(x0, y0, x1, y1);
3700 			y0 = dstH;
3701 			y1 = dstH - 1;
3702 			dst->computeCubeCorner(x0, y0, x1, y1);
3703 		}
3704 
3705 		src->unlockInternal();
3706 		dst->unlockInternal();
3707 	}
3708 
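	// Replaces a cube-map border corner texel with the average of its three
	// neighboring texels, so filtering across the corner sees a consistent value.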
3709 	void Surface::computeCubeCorner(int x0, int y0, int x1, int y1)
3710 	{
3711 		ASSERT(internal.lock != LOCK_UNLOCKED);
3712 
3713 		sw::Color<float> color = internal.read(x0, y1);
3714 		color += internal.read(x1, y0);
3715 		color += internal.read(x1, y1);
3716 		color *= (1.0f / 3.0f);
3717 
3718 		internal.write(x0, y0, color);
3719 	}
3720 
3721 	bool Surface::hasStencil() const
3722 	{
3723 		return isStencil(external.format);
3724 	}
3725 
3726 	bool Surface::hasDepth() const
3727 	{
3728 		return isDepth(external.format);
3729 	}
3730 
3731 	bool Surface::hasPalette() const
3732 	{
3733 		return isPalette(external.format);
3734 	}
3735 
3736 	bool Surface::isRenderTarget() const
3737 	{
3738 		return renderTarget;
3739 	}
3740 
3741 	bool Surface::hasDirtyContents() const
3742 	{
3743 		return dirtyContents;
3744 	}
3745 
3746 	void Surface::markContentsClean()
3747 	{
3748 		dirtyContents = false;
3749 	}
3750 
3751 	Resource *Surface::getResource()
3752 	{
3753 		return resource;
3754 	}
3755 
3756 	bool Surface::identicalFormats() const
3757 	{
3758 		return external.format == internal.format &&
3759 		       external.width  == internal.width &&
3760 		       external.height == internal.height &&
3761 		       external.depth  == internal.depth &&
3762 		       external.pitchB == internal.pitchB &&
3763 		       external.sliceB == internal.sliceB &&
3764 		       external.border == internal.border &&
3765 		       external.samples == internal.samples;
3766 	}
3767 
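	// Maps an externally visible format to the format used for internal storage.
	// Formats without a native internal representation are promoted to a wider one:
	// paletted and low-bit-depth color formats become 8888 variants, and compressed
	// formats are decoded into an uncompressed equivalent.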
3768 	Format Surface::selectInternalFormat(Format format) const
3769 	{
3770 		switch(format)
3771 		{
3772 		case FORMAT_NULL:
3773 			return FORMAT_NULL;
3774 		case FORMAT_P8:
3775 		case FORMAT_A8P8:
3776 		case FORMAT_A4R4G4B4:
3777 		case FORMAT_A1R5G5B5:
3778 		case FORMAT_A8R3G3B2:
3779 			return FORMAT_A8R8G8B8;
3780 		case FORMAT_A8:
3781 			return FORMAT_A8;
3782 		case FORMAT_R8I:
3783 			return FORMAT_R8I;
3784 		case FORMAT_R8UI:
3785 			return FORMAT_R8UI;
3786 		case FORMAT_R8_SNORM:
3787 			return FORMAT_R8_SNORM;
3788 		case FORMAT_R8:
3789 			return FORMAT_R8;
3790 		case FORMAT_R16I:
3791 			return FORMAT_R16I;
3792 		case FORMAT_R16UI:
3793 			return FORMAT_R16UI;
3794 		case FORMAT_R32I:
3795 			return FORMAT_R32I;
3796 		case FORMAT_R32UI:
3797 			return FORMAT_R32UI;
3798 		case FORMAT_X16B16G16R16I:
3799 			return FORMAT_X16B16G16R16I;
3800 		case FORMAT_A16B16G16R16I:
3801 			return FORMAT_A16B16G16R16I;
3802 		case FORMAT_X16B16G16R16UI:
3803 			return FORMAT_X16B16G16R16UI;
3804 		case FORMAT_A16B16G16R16UI:
3805 			return FORMAT_A16B16G16R16UI;
3806 		case FORMAT_A2R10G10B10:
3807 		case FORMAT_A2B10G10R10:
3808 		case FORMAT_A16B16G16R16:
3809 			return FORMAT_A16B16G16R16;
3810 		case FORMAT_A2B10G10R10UI:
3811 			return FORMAT_A16B16G16R16UI;
3812 		case FORMAT_X32B32G32R32I:
3813 			return FORMAT_X32B32G32R32I;
3814 		case FORMAT_A32B32G32R32I:
3815 			return FORMAT_A32B32G32R32I;
3816 		case FORMAT_X32B32G32R32UI:
3817 			return FORMAT_X32B32G32R32UI;
3818 		case FORMAT_A32B32G32R32UI:
3819 			return FORMAT_A32B32G32R32UI;
3820 		case FORMAT_G8R8I:
3821 			return FORMAT_G8R8I;
3822 		case FORMAT_G8R8UI:
3823 			return FORMAT_G8R8UI;
3824 		case FORMAT_G8R8_SNORM:
3825 			return FORMAT_G8R8_SNORM;
3826 		case FORMAT_G8R8:
3827 			return FORMAT_G8R8;
3828 		case FORMAT_G16R16I:
3829 			return FORMAT_G16R16I;
3830 		case FORMAT_G16R16UI:
3831 			return FORMAT_G16R16UI;
3832 		case FORMAT_G16R16:
3833 			return FORMAT_G16R16;
3834 		case FORMAT_G32R32I:
3835 			return FORMAT_G32R32I;
3836 		case FORMAT_G32R32UI:
3837 			return FORMAT_G32R32UI;
3838 		case FORMAT_A8R8G8B8:
3839 			if(lockable || !quadLayoutEnabled)
3840 			{
3841 				return FORMAT_A8R8G8B8;
3842 			}
3843 			else
3844 			{
3845 				return FORMAT_A8G8R8B8Q;
3846 			}
3847 		case FORMAT_A8B8G8R8I:
3848 			return FORMAT_A8B8G8R8I;
3849 		case FORMAT_A8B8G8R8UI:
3850 			return FORMAT_A8B8G8R8UI;
3851 		case FORMAT_A8B8G8R8_SNORM:
3852 			return FORMAT_A8B8G8R8_SNORM;
3853 		case FORMAT_R5G5B5A1:
3854 		case FORMAT_R4G4B4A4:
3855 		case FORMAT_A8B8G8R8:
3856 			return FORMAT_A8B8G8R8;
3857 		case FORMAT_R5G6B5:
3858 			return FORMAT_R5G6B5;
3859 		case FORMAT_R3G3B2:
3860 		case FORMAT_R8G8B8:
3861 		case FORMAT_X4R4G4B4:
3862 		case FORMAT_X1R5G5B5:
3863 		case FORMAT_X8R8G8B8:
3864 			if(lockable || !quadLayoutEnabled)
3865 			{
3866 				return FORMAT_X8R8G8B8;
3867 			}
3868 			else
3869 			{
3870 				return FORMAT_X8G8R8B8Q;
3871 			}
3872 		case FORMAT_X8B8G8R8I:
3873 			return FORMAT_X8B8G8R8I;
3874 		case FORMAT_X8B8G8R8UI:
3875 			return FORMAT_X8B8G8R8UI;
3876 		case FORMAT_X8B8G8R8_SNORM:
3877 			return FORMAT_X8B8G8R8_SNORM;
3878 		case FORMAT_B8G8R8:
3879 		case FORMAT_X8B8G8R8:
3880 			return FORMAT_X8B8G8R8;
3881 		case FORMAT_SRGB8_X8:
3882 			return FORMAT_SRGB8_X8;
3883 		case FORMAT_SRGB8_A8:
3884 			return FORMAT_SRGB8_A8;
3885 		// Compressed formats
3886 		case FORMAT_DXT1:
3887 		case FORMAT_DXT3:
3888 		case FORMAT_DXT5:
3889 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3890 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
3891 		case FORMAT_RGBA8_ETC2_EAC:
3892 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
3893 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
3894 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
3895 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
3896 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
3897 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
3898 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
3899 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
3900 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
3901 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
3902 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
3903 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
3904 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
3905 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
3906 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
3907 			return FORMAT_A8R8G8B8;
3908 		case FORMAT_RGBA_ASTC_4x4_KHR:
3909 		case FORMAT_RGBA_ASTC_5x4_KHR:
3910 		case FORMAT_RGBA_ASTC_5x5_KHR:
3911 		case FORMAT_RGBA_ASTC_6x5_KHR:
3912 		case FORMAT_RGBA_ASTC_6x6_KHR:
3913 		case FORMAT_RGBA_ASTC_8x5_KHR:
3914 		case FORMAT_RGBA_ASTC_8x6_KHR:
3915 		case FORMAT_RGBA_ASTC_8x8_KHR:
3916 		case FORMAT_RGBA_ASTC_10x5_KHR:
3917 		case FORMAT_RGBA_ASTC_10x6_KHR:
3918 		case FORMAT_RGBA_ASTC_10x8_KHR:
3919 		case FORMAT_RGBA_ASTC_10x10_KHR:
3920 		case FORMAT_RGBA_ASTC_12x10_KHR:
3921 		case FORMAT_RGBA_ASTC_12x12_KHR:
3922 			// ASTC supports HDR, so a floating point format is required to represent it properly
3923 			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
3924 		case FORMAT_ATI1:
3925 			return FORMAT_R8;
3926 		case FORMAT_R11_EAC:
3927 		case FORMAT_SIGNED_R11_EAC:
3928 			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
3929 		case FORMAT_ATI2:
3930 			return FORMAT_G8R8;
3931 		case FORMAT_RG11_EAC:
3932 		case FORMAT_SIGNED_RG11_EAC:
3933 			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
3934 		case FORMAT_ETC1:
3935 		case FORMAT_RGB8_ETC2:
3936 		case FORMAT_SRGB8_ETC2:
3937 			return FORMAT_X8R8G8B8;
3938 		// Bumpmap formats
3939 		case FORMAT_V8U8:			return FORMAT_V8U8;
3940 		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
3941 		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
3942 		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
3943 		case FORMAT_V16U16:			return FORMAT_V16U16;
3944 		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
3945 		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
3946 		// Floating-point formats
3947 		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
3948 		case FORMAT_R16F:			return FORMAT_R32F;
3949 		case FORMAT_G16R16F:		return FORMAT_G32R32F;
3950 		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
3951 		case FORMAT_X16B16G16R16F:	return FORMAT_X32B32G32R32F;
3952 		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
3953 		case FORMAT_X16B16G16R16F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3954 		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
3955 		case FORMAT_R32F:			return FORMAT_R32F;
3956 		case FORMAT_G32R32F:		return FORMAT_G32R32F;
3957 		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
3958 		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
3959 		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
3960 		case FORMAT_X32B32G32R32F_UNSIGNED: return FORMAT_X32B32G32R32F_UNSIGNED;
3961 		// Luminance formats
3962 		case FORMAT_L8:				return FORMAT_L8;
3963 		case FORMAT_A4L4:			return FORMAT_A8L8;
3964 		case FORMAT_L16:			return FORMAT_L16;
3965 		case FORMAT_A8L8:			return FORMAT_A8L8;
3966 		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
3967 		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
3968 		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
3969 		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
3970 		// Depth/stencil formats
3971 		case FORMAT_D16:
3972 		case FORMAT_D32:
3973 		case FORMAT_D24X8:
3974 			if(hasParent)   // Texture
3975 			{
3976 				return FORMAT_D32F_SHADOW;
3977 			}
3978 			else if(complementaryDepthBuffer)
3979 			{
3980 				return FORMAT_D32F_COMPLEMENTARY;
3981 			}
3982 			else
3983 			{
3984 				return FORMAT_D32F;
3985 			}
3986 		case FORMAT_D24S8:
3987 		case FORMAT_D24FS8:
3988 			if(hasParent)   // Texture
3989 			{
3990 				return FORMAT_D32FS8_SHADOW;
3991 			}
3992 			else if(complementaryDepthBuffer)
3993 			{
3994 				return FORMAT_D32FS8_COMPLEMENTARY;
3995 			}
3996 			else
3997 			{
3998 				return FORMAT_D32FS8;
3999 			}
4000 		case FORMAT_D32F:           return FORMAT_D32F;
4001 		case FORMAT_D32FS8:         return FORMAT_D32FS8;
4002 		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
4003 		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
4004 		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
4005 		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
4006 		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
4007 		case FORMAT_S8:             return FORMAT_S8;
4008 		// YUV formats
4009 		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
4010 		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
4011 		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
4012 		default:
4013 			ASSERT(false);
4014 		}
4015 
4016 		return FORMAT_NULL;
4017 	}
4018 
4019 	void Surface::setTexturePalette(unsigned int *palette)
4020 	{
4021 		Surface::palette = palette;
4022 		Surface::paletteID++;
4023 	}
4024 
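	// Resolves a multisampled render target in place. This is a no-op for
	// single-sampled surfaces, surfaces whose contents are not dirty,
	// non-render-targets, and FORMAT_NULL.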
4025 	void Surface::resolve()
4026 	{
4027 		if(internal.samples <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
4028 		{
4029 			return;
4030 		}
4031 
4032 		ASSERT(internal.depth == 1);  // Unimplemented
4033 
4034 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
4035 
4036 		int width = internal.width;
4037 		int height = internal.height;
4038 		int pitch = internal.pitchB;
4039 		int slice = internal.sliceB;
4040 
4041 		unsigned char *source0 = (unsigned char*)source;
4042 		unsigned char *source1 = source0 + slice;
4043 		unsigned char *source2 = source1 + slice;
4044 		unsigned char *source3 = source2 + slice;
4045 		unsigned char *source4 = source3 + slice;
4046 		unsigned char *source5 = source4 + slice;
4047 		unsigned char *source6 = source5 + slice;
4048 		unsigned char *source7 = source6 + slice;
4049 		unsigned char *source8 = source7 + slice;
4050 		unsigned char *source9 = source8 + slice;
4051 		unsigned char *sourceA = source9 + slice;
4052 		unsigned char *sourceB = sourceA + slice;
4053 		unsigned char *sourceC = sourceB + slice;
4054 		unsigned char *sourceD = sourceC + slice;
4055 		unsigned char *sourceE = sourceD + slice;
4056 		unsigned char *sourceF = sourceE + slice;
4057 
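		// Box-filter resolve: sample 0 is overwritten with the average of all samples.
		// Averaging is done pairwise in a tree so every sample receives equal weight,
		// using _mm_avg_epu8/_mm_avg_epu16 where SSE2 is available and a carry-safe
		// integer AVERAGE macro otherwise.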
4058 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
4059 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
4060 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
4061 		{
4062 			#if defined(__i386__) || defined(__x86_64__)
4063 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4064 				{
4065 					if(internal.samples == 2)
4066 					{
4067 						for(int y = 0; y < height; y++)
4068 						{
4069 							for(int x = 0; x < width; x += 4)
4070 							{
4071 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4072 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4073 
4074 								c0 = _mm_avg_epu8(c0, c1);
4075 
4076 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4077 							}
4078 
4079 							source0 += pitch;
4080 							source1 += pitch;
4081 						}
4082 					}
4083 					else if(internal.samples == 4)
4084 					{
4085 						for(int y = 0; y < height; y++)
4086 						{
4087 							for(int x = 0; x < width; x += 4)
4088 							{
4089 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4090 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4091 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4092 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4093 
4094 								c0 = _mm_avg_epu8(c0, c1);
4095 								c2 = _mm_avg_epu8(c2, c3);
4096 								c0 = _mm_avg_epu8(c0, c2);
4097 
4098 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4099 							}
4100 
4101 							source0 += pitch;
4102 							source1 += pitch;
4103 							source2 += pitch;
4104 							source3 += pitch;
4105 						}
4106 					}
4107 					else if(internal.samples == 8)
4108 					{
4109 						for(int y = 0; y < height; y++)
4110 						{
4111 							for(int x = 0; x < width; x += 4)
4112 							{
4113 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4114 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4115 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4116 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4117 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4118 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4119 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4120 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4121 
4122 								c0 = _mm_avg_epu8(c0, c1);
4123 								c2 = _mm_avg_epu8(c2, c3);
4124 								c4 = _mm_avg_epu8(c4, c5);
4125 								c6 = _mm_avg_epu8(c6, c7);
4126 								c0 = _mm_avg_epu8(c0, c2);
4127 								c4 = _mm_avg_epu8(c4, c6);
4128 								c0 = _mm_avg_epu8(c0, c4);
4129 
4130 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4131 							}
4132 
4133 							source0 += pitch;
4134 							source1 += pitch;
4135 							source2 += pitch;
4136 							source3 += pitch;
4137 							source4 += pitch;
4138 							source5 += pitch;
4139 							source6 += pitch;
4140 							source7 += pitch;
4141 						}
4142 					}
4143 					else if(internal.samples == 16)
4144 					{
4145 						for(int y = 0; y < height; y++)
4146 						{
4147 							for(int x = 0; x < width; x += 4)
4148 							{
4149 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4150 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4151 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4152 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4153 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4154 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4155 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4156 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4157 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4158 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4159 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4160 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4161 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4162 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4163 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4164 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4165 
4166 								c0 = _mm_avg_epu8(c0, c1);
4167 								c2 = _mm_avg_epu8(c2, c3);
4168 								c4 = _mm_avg_epu8(c4, c5);
4169 								c6 = _mm_avg_epu8(c6, c7);
4170 								c8 = _mm_avg_epu8(c8, c9);
4171 								cA = _mm_avg_epu8(cA, cB);
4172 								cC = _mm_avg_epu8(cC, cD);
4173 								cE = _mm_avg_epu8(cE, cF);
4174 								c0 = _mm_avg_epu8(c0, c2);
4175 								c4 = _mm_avg_epu8(c4, c6);
4176 								c8 = _mm_avg_epu8(c8, cA);
4177 								cC = _mm_avg_epu8(cC, cE);
4178 								c0 = _mm_avg_epu8(c0, c4);
4179 								c8 = _mm_avg_epu8(c8, cC);
4180 								c0 = _mm_avg_epu8(c0, c8);
4181 
4182 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4183 							}
4184 
4185 							source0 += pitch;
4186 							source1 += pitch;
4187 							source2 += pitch;
4188 							source3 += pitch;
4189 							source4 += pitch;
4190 							source5 += pitch;
4191 							source6 += pitch;
4192 							source7 += pitch;
4193 							source8 += pitch;
4194 							source9 += pitch;
4195 							sourceA += pitch;
4196 							sourceB += pitch;
4197 							sourceC += pitch;
4198 							sourceD += pitch;
4199 							sourceE += pitch;
4200 							sourceF += pitch;
4201 						}
4202 					}
4203 					else ASSERT(false);
4204 				}
4205 				else
4206 			#endif
4207 			{
4208 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
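				// AVERAGE computes the per-byte average of two packed 8-bit quads, rounded up to
				// match _mm_avg_epu8: (x & y) + ((x ^ y) >> 1) is the overflow-free average, the
				// 0x7F7F7F7F mask keeps the shift from leaking bits between bytes, and the
				// 0x01010101 term rounds up whenever the low bits differ.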
4209 
4210 				if(internal.samples == 2)
4211 				{
4212 					for(int y = 0; y < height; y++)
4213 					{
4214 						for(int x = 0; x < width; x++)
4215 						{
4216 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4217 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4218 
4219 							c0 = AVERAGE(c0, c1);
4220 
4221 							*(unsigned int*)(source0 + 4 * x) = c0;
4222 						}
4223 
4224 						source0 += pitch;
4225 						source1 += pitch;
4226 					}
4227 				}
4228 				else if(internal.samples == 4)
4229 				{
4230 					for(int y = 0; y < height; y++)
4231 					{
4232 						for(int x = 0; x < width; x++)
4233 						{
4234 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4235 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4236 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4237 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4238 
4239 							c0 = AVERAGE(c0, c1);
4240 							c2 = AVERAGE(c2, c3);
4241 							c0 = AVERAGE(c0, c2);
4242 
4243 							*(unsigned int*)(source0 + 4 * x) = c0;
4244 						}
4245 
4246 						source0 += pitch;
4247 						source1 += pitch;
4248 						source2 += pitch;
4249 						source3 += pitch;
4250 					}
4251 				}
4252 				else if(internal.samples == 8)
4253 				{
4254 					for(int y = 0; y < height; y++)
4255 					{
4256 						for(int x = 0; x < width; x++)
4257 						{
4258 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4259 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4260 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4261 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4262 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4263 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4264 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4265 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4266 
4267 							c0 = AVERAGE(c0, c1);
4268 							c2 = AVERAGE(c2, c3);
4269 							c4 = AVERAGE(c4, c5);
4270 							c6 = AVERAGE(c6, c7);
4271 							c0 = AVERAGE(c0, c2);
4272 							c4 = AVERAGE(c4, c6);
4273 							c0 = AVERAGE(c0, c4);
4274 
4275 							*(unsigned int*)(source0 + 4 * x) = c0;
4276 						}
4277 
4278 						source0 += pitch;
4279 						source1 += pitch;
4280 						source2 += pitch;
4281 						source3 += pitch;
4282 						source4 += pitch;
4283 						source5 += pitch;
4284 						source6 += pitch;
4285 						source7 += pitch;
4286 					}
4287 				}
4288 				else if(internal.samples == 16)
4289 				{
4290 					for(int y = 0; y < height; y++)
4291 					{
4292 						for(int x = 0; x < width; x++)
4293 						{
4294 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4295 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4296 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4297 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4298 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4299 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4300 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4301 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4302 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4303 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4304 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4305 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4306 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4307 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4308 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4309 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4310 
4311 							c0 = AVERAGE(c0, c1);
4312 							c2 = AVERAGE(c2, c3);
4313 							c4 = AVERAGE(c4, c5);
4314 							c6 = AVERAGE(c6, c7);
4315 							c8 = AVERAGE(c8, c9);
4316 							cA = AVERAGE(cA, cB);
4317 							cC = AVERAGE(cC, cD);
4318 							cE = AVERAGE(cE, cF);
4319 							c0 = AVERAGE(c0, c2);
4320 							c4 = AVERAGE(c4, c6);
4321 							c8 = AVERAGE(c8, cA);
4322 							cC = AVERAGE(cC, cE);
4323 							c0 = AVERAGE(c0, c4);
4324 							c8 = AVERAGE(c8, cC);
4325 							c0 = AVERAGE(c0, c8);
4326 
4327 							*(unsigned int*)(source0 + 4 * x) = c0;
4328 						}
4329 
4330 						source0 += pitch;
4331 						source1 += pitch;
4332 						source2 += pitch;
4333 						source3 += pitch;
4334 						source4 += pitch;
4335 						source5 += pitch;
4336 						source6 += pitch;
4337 						source7 += pitch;
4338 						source8 += pitch;
4339 						source9 += pitch;
4340 						sourceA += pitch;
4341 						sourceB += pitch;
4342 						sourceC += pitch;
4343 						sourceD += pitch;
4344 						sourceE += pitch;
4345 						sourceF += pitch;
4346 					}
4347 				}
4348 				else ASSERT(false);
4349 
4350 				#undef AVERAGE
4351 			}
4352 		}
4353 		else if(internal.format == FORMAT_G16R16)
4354 		{
4356 			#if defined(__i386__) || defined(__x86_64__)
4357 				if(CPUID::supportsSSE2() && (width % 4) == 0)
4358 				{
4359 					if(internal.samples == 2)
4360 					{
4361 						for(int y = 0; y < height; y++)
4362 						{
4363 							for(int x = 0; x < width; x += 4)
4364 							{
4365 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4366 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4367 
4368 								c0 = _mm_avg_epu16(c0, c1);
4369 
4370 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4371 							}
4372 
4373 							source0 += pitch;
4374 							source1 += pitch;
4375 						}
4376 					}
4377 					else if(internal.samples == 4)
4378 					{
4379 						for(int y = 0; y < height; y++)
4380 						{
4381 							for(int x = 0; x < width; x += 4)
4382 							{
4383 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4384 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4385 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4386 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4387 
4388 								c0 = _mm_avg_epu16(c0, c1);
4389 								c2 = _mm_avg_epu16(c2, c3);
4390 								c0 = _mm_avg_epu16(c0, c2);
4391 
4392 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4393 							}
4394 
4395 							source0 += pitch;
4396 							source1 += pitch;
4397 							source2 += pitch;
4398 							source3 += pitch;
4399 						}
4400 					}
4401 					else if(internal.samples == 8)
4402 					{
4403 						for(int y = 0; y < height; y++)
4404 						{
4405 							for(int x = 0; x < width; x += 4)
4406 							{
4407 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4408 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4409 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4410 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4411 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4412 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4413 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4414 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4415 
4416 								c0 = _mm_avg_epu16(c0, c1);
4417 								c2 = _mm_avg_epu16(c2, c3);
4418 								c4 = _mm_avg_epu16(c4, c5);
4419 								c6 = _mm_avg_epu16(c6, c7);
4420 								c0 = _mm_avg_epu16(c0, c2);
4421 								c4 = _mm_avg_epu16(c4, c6);
4422 								c0 = _mm_avg_epu16(c0, c4);
4423 
4424 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4425 							}
4426 
4427 							source0 += pitch;
4428 							source1 += pitch;
4429 							source2 += pitch;
4430 							source3 += pitch;
4431 							source4 += pitch;
4432 							source5 += pitch;
4433 							source6 += pitch;
4434 							source7 += pitch;
4435 						}
4436 					}
4437 					else if(internal.samples == 16)
4438 					{
4439 						for(int y = 0; y < height; y++)
4440 						{
4441 							for(int x = 0; x < width; x += 4)
4442 							{
4443 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4444 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4445 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4446 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4447 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4448 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4449 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4450 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4451 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4452 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4453 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4454 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4455 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4456 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4457 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4458 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4459 
4460 								c0 = _mm_avg_epu16(c0, c1);
4461 								c2 = _mm_avg_epu16(c2, c3);
4462 								c4 = _mm_avg_epu16(c4, c5);
4463 								c6 = _mm_avg_epu16(c6, c7);
4464 								c8 = _mm_avg_epu16(c8, c9);
4465 								cA = _mm_avg_epu16(cA, cB);
4466 								cC = _mm_avg_epu16(cC, cD);
4467 								cE = _mm_avg_epu16(cE, cF);
4468 								c0 = _mm_avg_epu16(c0, c2);
4469 								c4 = _mm_avg_epu16(c4, c6);
4470 								c8 = _mm_avg_epu16(c8, cA);
4471 								cC = _mm_avg_epu16(cC, cE);
4472 								c0 = _mm_avg_epu16(c0, c4);
4473 								c8 = _mm_avg_epu16(c8, cC);
4474 								c0 = _mm_avg_epu16(c0, c8);
4475 
4476 								_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4477 							}
4478 
4479 							source0 += pitch;
4480 							source1 += pitch;
4481 							source2 += pitch;
4482 							source3 += pitch;
4483 							source4 += pitch;
4484 							source5 += pitch;
4485 							source6 += pitch;
4486 							source7 += pitch;
4487 							source8 += pitch;
4488 							source9 += pitch;
4489 							sourceA += pitch;
4490 							sourceB += pitch;
4491 							sourceC += pitch;
4492 							sourceD += pitch;
4493 							sourceE += pitch;
4494 							sourceF += pitch;
4495 						}
4496 					}
4497 					else ASSERT(false);
4498 				}
4499 				else
4500 			#endif
4501 			{
4502 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
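				// Same pairwise averaging as above, but per 16-bit channel: the 0x7FFF7FFF and
				// 0x00010001 masks cover the two 16-bit components packed in each 32-bit word.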
4503 
4504 				if(internal.samples == 2)
4505 				{
4506 					for(int y = 0; y < height; y++)
4507 					{
4508 						for(int x = 0; x < width; x++)
4509 						{
4510 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4511 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4512 
4513 							c0 = AVERAGE(c0, c1);
4514 
4515 							*(unsigned int*)(source0 + 4 * x) = c0;
4516 						}
4517 
4518 						source0 += pitch;
4519 						source1 += pitch;
4520 					}
4521 				}
4522 				else if(internal.samples == 4)
4523 				{
4524 					for(int y = 0; y < height; y++)
4525 					{
4526 						for(int x = 0; x < width; x++)
4527 						{
4528 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4529 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4530 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4531 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4532 
4533 							c0 = AVERAGE(c0, c1);
4534 							c2 = AVERAGE(c2, c3);
4535 							c0 = AVERAGE(c0, c2);
4536 
4537 							*(unsigned int*)(source0 + 4 * x) = c0;
4538 						}
4539 
4540 						source0 += pitch;
4541 						source1 += pitch;
4542 						source2 += pitch;
4543 						source3 += pitch;
4544 					}
4545 				}
4546 				else if(internal.samples == 8)
4547 				{
4548 					for(int y = 0; y < height; y++)
4549 					{
4550 						for(int x = 0; x < width; x++)
4551 						{
4552 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4553 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4554 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4555 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4556 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4557 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4558 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4559 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4560 
4561 							c0 = AVERAGE(c0, c1);
4562 							c2 = AVERAGE(c2, c3);
4563 							c4 = AVERAGE(c4, c5);
4564 							c6 = AVERAGE(c6, c7);
4565 							c0 = AVERAGE(c0, c2);
4566 							c4 = AVERAGE(c4, c6);
4567 							c0 = AVERAGE(c0, c4);
4568 
4569 							*(unsigned int*)(source0 + 4 * x) = c0;
4570 						}
4571 
4572 						source0 += pitch;
4573 						source1 += pitch;
4574 						source2 += pitch;
4575 						source3 += pitch;
4576 						source4 += pitch;
4577 						source5 += pitch;
4578 						source6 += pitch;
4579 						source7 += pitch;
4580 					}
4581 				}
4582 				else if(internal.samples == 16)
4583 				{
4584 					for(int y = 0; y < height; y++)
4585 					{
4586 						for(int x = 0; x < width; x++)
4587 						{
4588 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4589 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4590 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4591 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4592 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4593 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4594 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4595 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4596 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4597 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4598 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4599 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4600 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4601 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4602 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4603 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4604 
4605 							c0 = AVERAGE(c0, c1);
4606 							c2 = AVERAGE(c2, c3);
4607 							c4 = AVERAGE(c4, c5);
4608 							c6 = AVERAGE(c6, c7);
4609 							c8 = AVERAGE(c8, c9);
4610 							cA = AVERAGE(cA, cB);
4611 							cC = AVERAGE(cC, cD);
4612 							cE = AVERAGE(cE, cF);
4613 							c0 = AVERAGE(c0, c2);
4614 							c4 = AVERAGE(c4, c6);
4615 							c8 = AVERAGE(c8, cA);
4616 							cC = AVERAGE(cC, cE);
4617 							c0 = AVERAGE(c0, c4);
4618 							c8 = AVERAGE(c8, cC);
4619 							c0 = AVERAGE(c0, c8);
4620 
4621 							*(unsigned int*)(source0 + 4 * x) = c0;
4622 						}
4623 
4624 						source0 += pitch;
4625 						source1 += pitch;
4626 						source2 += pitch;
4627 						source3 += pitch;
4628 						source4 += pitch;
4629 						source5 += pitch;
4630 						source6 += pitch;
4631 						source7 += pitch;
4632 						source8 += pitch;
4633 						source9 += pitch;
4634 						sourceA += pitch;
4635 						sourceB += pitch;
4636 						sourceC += pitch;
4637 						sourceD += pitch;
4638 						sourceE += pitch;
4639 						sourceF += pitch;
4640 					}
4641 				}
4642 				else ASSERT(false);
4643 
4644 				#undef AVERAGE
4645 			}
4646 		}
4647 		else if(internal.format == FORMAT_A16B16G16R16)
4648 		{
4649 			#if defined(__i386__) || defined(__x86_64__)
4650 				if(CPUID::supportsSSE2() && (width % 2) == 0)
4651 				{
4652 					if(internal.samples == 2)
4653 					{
4654 						for(int y = 0; y < height; y++)
4655 						{
4656 							for(int x = 0; x < width; x += 2)
4657 							{
4658 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4659 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4660 
4661 								c0 = _mm_avg_epu16(c0, c1);
4662 
4663 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4664 							}
4665 
4666 							source0 += pitch;
4667 							source1 += pitch;
4668 						}
4669 					}
4670 					else if(internal.samples == 4)
4671 					{
4672 						for(int y = 0; y < height; y++)
4673 						{
4674 							for(int x = 0; x < width; x += 2)
4675 							{
4676 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4677 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4678 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4679 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4680 
4681 								c0 = _mm_avg_epu16(c0, c1);
4682 								c2 = _mm_avg_epu16(c2, c3);
4683 								c0 = _mm_avg_epu16(c0, c2);
4684 
4685 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4686 							}
4687 
4688 							source0 += pitch;
4689 							source1 += pitch;
4690 							source2 += pitch;
4691 							source3 += pitch;
4692 						}
4693 					}
4694 					else if(internal.samples == 8)
4695 					{
4696 						for(int y = 0; y < height; y++)
4697 						{
4698 							for(int x = 0; x < width; x += 2)
4699 							{
4700 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4701 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4702 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4703 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4704 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4705 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4706 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4707 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4708 
4709 								c0 = _mm_avg_epu16(c0, c1);
4710 								c2 = _mm_avg_epu16(c2, c3);
4711 								c4 = _mm_avg_epu16(c4, c5);
4712 								c6 = _mm_avg_epu16(c6, c7);
4713 								c0 = _mm_avg_epu16(c0, c2);
4714 								c4 = _mm_avg_epu16(c4, c6);
4715 								c0 = _mm_avg_epu16(c0, c4);
4716 
4717 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4718 							}
4719 
4720 							source0 += pitch;
4721 							source1 += pitch;
4722 							source2 += pitch;
4723 							source3 += pitch;
4724 							source4 += pitch;
4725 							source5 += pitch;
4726 							source6 += pitch;
4727 							source7 += pitch;
4728 						}
4729 					}
4730 					else if(internal.samples == 16)
4731 					{
4732 						for(int y = 0; y < height; y++)
4733 						{
4734 							for(int x = 0; x < width; x += 2)
4735 							{
4736 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4737 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4738 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4739 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4740 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4741 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4742 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4743 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4744 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4745 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4746 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4747 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4748 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4749 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4750 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4751 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4752 
4753 								c0 = _mm_avg_epu16(c0, c1);
4754 								c2 = _mm_avg_epu16(c2, c3);
4755 								c4 = _mm_avg_epu16(c4, c5);
4756 								c6 = _mm_avg_epu16(c6, c7);
4757 								c8 = _mm_avg_epu16(c8, c9);
4758 								cA = _mm_avg_epu16(cA, cB);
4759 								cC = _mm_avg_epu16(cC, cD);
4760 								cE = _mm_avg_epu16(cE, cF);
4761 								c0 = _mm_avg_epu16(c0, c2);
4762 								c4 = _mm_avg_epu16(c4, c6);
4763 								c8 = _mm_avg_epu16(c8, cA);
4764 								cC = _mm_avg_epu16(cC, cE);
4765 								c0 = _mm_avg_epu16(c0, c4);
4766 								c8 = _mm_avg_epu16(c8, cC);
4767 								c0 = _mm_avg_epu16(c0, c8);
4768 
4769 								_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4770 							}
4771 
4772 							source0 += pitch;
4773 							source1 += pitch;
4774 							source2 += pitch;
4775 							source3 += pitch;
4776 							source4 += pitch;
4777 							source5 += pitch;
4778 							source6 += pitch;
4779 							source7 += pitch;
4780 							source8 += pitch;
4781 							source9 += pitch;
4782 							sourceA += pitch;
4783 							sourceB += pitch;
4784 							sourceC += pitch;
4785 							sourceD += pitch;
4786 							sourceE += pitch;
4787 							sourceF += pitch;
4788 						}
4789 					}
4790 					else ASSERT(false);
4791 				}
4792 				else
4793 			#endif
4794 			{
4795 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
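				// Per-16-bit averaging again; A16B16G16R16 pixels are 8 bytes each, so the loops
				// below process 2 * width 32-bit words per row.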
4796 
4797 				if(internal.samples == 2)
4798 				{
4799 					for(int y = 0; y < height; y++)
4800 					{
4801 						for(int x = 0; x < 2 * width; x++)
4802 						{
4803 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4804 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4805 
4806 							c0 = AVERAGE(c0, c1);
4807 
4808 							*(unsigned int*)(source0 + 4 * x) = c0;
4809 						}
4810 
4811 						source0 += pitch;
4812 						source1 += pitch;
4813 					}
4814 				}
4815 				else if(internal.samples == 4)
4816 				{
4817 					for(int y = 0; y < height; y++)
4818 					{
4819 						for(int x = 0; x < 2 * width; x++)
4820 						{
4821 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4822 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4823 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4824 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4825 
4826 							c0 = AVERAGE(c0, c1);
4827 							c2 = AVERAGE(c2, c3);
4828 							c0 = AVERAGE(c0, c2);
4829 
4830 							*(unsigned int*)(source0 + 4 * x) = c0;
4831 						}
4832 
4833 						source0 += pitch;
4834 						source1 += pitch;
4835 						source2 += pitch;
4836 						source3 += pitch;
4837 					}
4838 				}
4839 				else if(internal.samples == 8)
4840 				{
4841 					for(int y = 0; y < height; y++)
4842 					{
4843 						for(int x = 0; x < 2 * width; x++)
4844 						{
4845 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4846 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4847 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4848 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4849 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4850 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4851 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4852 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4853 
4854 							c0 = AVERAGE(c0, c1);
4855 							c2 = AVERAGE(c2, c3);
4856 							c4 = AVERAGE(c4, c5);
4857 							c6 = AVERAGE(c6, c7);
4858 							c0 = AVERAGE(c0, c2);
4859 							c4 = AVERAGE(c4, c6);
4860 							c0 = AVERAGE(c0, c4);
4861 
4862 							*(unsigned int*)(source0 + 4 * x) = c0;
4863 						}
4864 
4865 						source0 += pitch;
4866 						source1 += pitch;
4867 						source2 += pitch;
4868 						source3 += pitch;
4869 						source4 += pitch;
4870 						source5 += pitch;
4871 						source6 += pitch;
4872 						source7 += pitch;
4873 					}
4874 				}
4875 				else if(internal.samples == 16)
4876 				{
4877 					for(int y = 0; y < height; y++)
4878 					{
4879 						for(int x = 0; x < 2 * width; x++)
4880 						{
4881 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4882 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4883 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4884 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4885 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4886 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4887 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4888 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4889 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4890 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4891 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4892 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4893 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4894 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4895 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4896 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4897 
4898 							c0 = AVERAGE(c0, c1);
4899 							c2 = AVERAGE(c2, c3);
4900 							c4 = AVERAGE(c4, c5);
4901 							c6 = AVERAGE(c6, c7);
4902 							c8 = AVERAGE(c8, c9);
4903 							cA = AVERAGE(cA, cB);
4904 							cC = AVERAGE(cC, cD);
4905 							cE = AVERAGE(cE, cF);
4906 							c0 = AVERAGE(c0, c2);
4907 							c4 = AVERAGE(c4, c6);
4908 							c8 = AVERAGE(c8, cA);
4909 							cC = AVERAGE(cC, cE);
4910 							c0 = AVERAGE(c0, c4);
4911 							c8 = AVERAGE(c8, cC);
4912 							c0 = AVERAGE(c0, c8);
4913 
4914 							*(unsigned int*)(source0 + 4 * x) = c0;
4915 						}
4916 
4917 						source0 += pitch;
4918 						source1 += pitch;
4919 						source2 += pitch;
4920 						source3 += pitch;
4921 						source4 += pitch;
4922 						source5 += pitch;
4923 						source6 += pitch;
4924 						source7 += pitch;
4925 						source8 += pitch;
4926 						source9 += pitch;
4927 						sourceA += pitch;
4928 						sourceB += pitch;
4929 						sourceC += pitch;
4930 						sourceD += pitch;
4931 						sourceE += pitch;
4932 						sourceF += pitch;
4933 					}
4934 				}
4935 				else ASSERT(false);
4936 
4937 				#undef AVERAGE
4938 			}
4939 		}
4940 		else if(internal.format == FORMAT_R32F)
4941 		{
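			// Single-channel 32-bit float: resolve by summing all samples and scaling by 1 / sampleCount.
			// The SSE path requires the width to be a multiple of 4 so each iteration processes four aligned floats.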
4942 			#if defined(__i386__) || defined(__x86_64__)
4943 				if(CPUID::supportsSSE() && (width % 4) == 0)
4944 				{
4945 					if(internal.samples == 2)
4946 					{
4947 						for(int y = 0; y < height; y++)
4948 						{
4949 							for(int x = 0; x < width; x += 4)
4950 							{
4951 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4952 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4953 
4954 								c0 = _mm_add_ps(c0, c1);
4955 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4956 
4957 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4958 							}
4959 
4960 							source0 += pitch;
4961 							source1 += pitch;
4962 						}
4963 					}
4964 					else if(internal.samples == 4)
4965 					{
4966 						for(int y = 0; y < height; y++)
4967 						{
4968 							for(int x = 0; x < width; x += 4)
4969 							{
4970 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4971 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4972 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4973 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4974 
4975 								c0 = _mm_add_ps(c0, c1);
4976 								c2 = _mm_add_ps(c2, c3);
4977 								c0 = _mm_add_ps(c0, c2);
4978 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4979 
4980 								_mm_store_ps((float*)(source0 + 4 * x), c0);
4981 							}
4982 
4983 							source0 += pitch;
4984 							source1 += pitch;
4985 							source2 += pitch;
4986 							source3 += pitch;
4987 						}
4988 					}
4989 					else if(internal.samples == 8)
4990 					{
4991 						for(int y = 0; y < height; y++)
4992 						{
4993 							for(int x = 0; x < width; x += 4)
4994 							{
4995 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4996 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4997 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4998 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4999 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5000 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5001 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5002 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5003 
5004 								c0 = _mm_add_ps(c0, c1);
5005 								c2 = _mm_add_ps(c2, c3);
5006 								c4 = _mm_add_ps(c4, c5);
5007 								c6 = _mm_add_ps(c6, c7);
5008 								c0 = _mm_add_ps(c0, c2);
5009 								c4 = _mm_add_ps(c4, c6);
5010 								c0 = _mm_add_ps(c0, c4);
5011 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5012 
5013 								_mm_store_ps((float*)(source0 + 4 * x), c0);
5014 							}
5015 
5016 							source0 += pitch;
5017 							source1 += pitch;
5018 							source2 += pitch;
5019 							source3 += pitch;
5020 							source4 += pitch;
5021 							source5 += pitch;
5022 							source6 += pitch;
5023 							source7 += pitch;
5024 						}
5025 					}
5026 					else if(internal.samples == 16)
5027 					{
5028 						for(int y = 0; y < height; y++)
5029 						{
5030 							for(int x = 0; x < width; x += 4)
5031 							{
5032 								__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
5033 								__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
5034 								__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
5035 								__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
5036 								__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
5037 								__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
5038 								__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
5039 								__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
5040 								__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
5041 								__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
5042 								__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
5043 								__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
5044 								__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
5045 								__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
5046 								__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
5047 								__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
5048 
5049 								c0 = _mm_add_ps(c0, c1);
5050 								c2 = _mm_add_ps(c2, c3);
5051 								c4 = _mm_add_ps(c4, c5);
5052 								c6 = _mm_add_ps(c6, c7);
5053 								c8 = _mm_add_ps(c8, c9);
5054 								cA = _mm_add_ps(cA, cB);
5055 								cC = _mm_add_ps(cC, cD);
5056 								cE = _mm_add_ps(cE, cF);
5057 								c0 = _mm_add_ps(c0, c2);
5058 								c4 = _mm_add_ps(c4, c6);
5059 								c8 = _mm_add_ps(c8, cA);
5060 								cC = _mm_add_ps(cC, cE);
5061 								c0 = _mm_add_ps(c0, c4);
5062 								c8 = _mm_add_ps(c8, cC);
5063 								c0 = _mm_add_ps(c0, c8);
5064 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5065 
5066 								_mm_store_ps((float*)(source0 + 4 * x), c0);
5067 							}
5068 
5069 							source0 += pitch;
5070 							source1 += pitch;
5071 							source2 += pitch;
5072 							source3 += pitch;
5073 							source4 += pitch;
5074 							source5 += pitch;
5075 							source6 += pitch;
5076 							source7 += pitch;
5077 							source8 += pitch;
5078 							source9 += pitch;
5079 							sourceA += pitch;
5080 							sourceB += pitch;
5081 							sourceC += pitch;
5082 							sourceD += pitch;
5083 							sourceE += pitch;
5084 							sourceF += pitch;
5085 						}
5086 					}
5087 					else ASSERT(false);
5088 				}
5089 				else
5090 			#endif
5091 			{
5092 				if(internal.samples == 2)
5093 				{
5094 					for(int y = 0; y < height; y++)
5095 					{
5096 						for(int x = 0; x < width; x++)
5097 						{
5098 							float c0 = *(float*)(source0 + 4 * x);
5099 							float c1 = *(float*)(source1 + 4 * x);
5100 
5101 							c0 = c0 + c1;
5102 							c0 *= 1.0f / 2.0f;
5103 
5104 							*(float*)(source0 + 4 * x) = c0;
5105 						}
5106 
5107 						source0 += pitch;
5108 						source1 += pitch;
5109 					}
5110 				}
5111 				else if(internal.samples == 4)
5112 				{
5113 					for(int y = 0; y < height; y++)
5114 					{
5115 						for(int x = 0; x < width; x++)
5116 						{
5117 							float c0 = *(float*)(source0 + 4 * x);
5118 							float c1 = *(float*)(source1 + 4 * x);
5119 							float c2 = *(float*)(source2 + 4 * x);
5120 							float c3 = *(float*)(source3 + 4 * x);
5121 
5122 							c0 = c0 + c1;
5123 							c2 = c2 + c3;
5124 							c0 = c0 + c2;
5125 							c0 *= 1.0f / 4.0f;
5126 
5127 							*(float*)(source0 + 4 * x) = c0;
5128 						}
5129 
5130 						source0 += pitch;
5131 						source1 += pitch;
5132 						source2 += pitch;
5133 						source3 += pitch;
5134 					}
5135 				}
5136 				else if(internal.samples == 8)
5137 				{
5138 					for(int y = 0; y < height; y++)
5139 					{
5140 						for(int x = 0; x < width; x++)
5141 						{
5142 							float c0 = *(float*)(source0 + 4 * x);
5143 							float c1 = *(float*)(source1 + 4 * x);
5144 							float c2 = *(float*)(source2 + 4 * x);
5145 							float c3 = *(float*)(source3 + 4 * x);
5146 							float c4 = *(float*)(source4 + 4 * x);
5147 							float c5 = *(float*)(source5 + 4 * x);
5148 							float c6 = *(float*)(source6 + 4 * x);
5149 							float c7 = *(float*)(source7 + 4 * x);
5150 
5151 							c0 = c0 + c1;
5152 							c2 = c2 + c3;
5153 							c4 = c4 + c5;
5154 							c6 = c6 + c7;
5155 							c0 = c0 + c2;
5156 							c4 = c4 + c6;
5157 							c0 = c0 + c4;
5158 							c0 *= 1.0f / 8.0f;
5159 
5160 							*(float*)(source0 + 4 * x) = c0;
5161 						}
5162 
5163 						source0 += pitch;
5164 						source1 += pitch;
5165 						source2 += pitch;
5166 						source3 += pitch;
5167 						source4 += pitch;
5168 						source5 += pitch;
5169 						source6 += pitch;
5170 						source7 += pitch;
5171 					}
5172 				}
5173 				else if(internal.samples == 16)
5174 				{
5175 					for(int y = 0; y < height; y++)
5176 					{
5177 						for(int x = 0; x < width; x++)
5178 						{
5179 							float c0 = *(float*)(source0 + 4 * x);
5180 							float c1 = *(float*)(source1 + 4 * x);
5181 							float c2 = *(float*)(source2 + 4 * x);
5182 							float c3 = *(float*)(source3 + 4 * x);
5183 							float c4 = *(float*)(source4 + 4 * x);
5184 							float c5 = *(float*)(source5 + 4 * x);
5185 							float c6 = *(float*)(source6 + 4 * x);
5186 							float c7 = *(float*)(source7 + 4 * x);
5187 							float c8 = *(float*)(source8 + 4 * x);
5188 							float c9 = *(float*)(source9 + 4 * x);
5189 							float cA = *(float*)(sourceA + 4 * x);
5190 							float cB = *(float*)(sourceB + 4 * x);
5191 							float cC = *(float*)(sourceC + 4 * x);
5192 							float cD = *(float*)(sourceD + 4 * x);
5193 							float cE = *(float*)(sourceE + 4 * x);
5194 							float cF = *(float*)(sourceF + 4 * x);
5195 
5196 							c0 = c0 + c1;
5197 							c2 = c2 + c3;
5198 							c4 = c4 + c5;
5199 							c6 = c6 + c7;
5200 							c8 = c8 + c9;
5201 							cA = cA + cB;
5202 							cC = cC + cD;
5203 							cE = cE + cF;
5204 							c0 = c0 + c2;
5205 							c4 = c4 + c6;
5206 							c8 = c8 + cA;
5207 							cC = cC + cE;
5208 							c0 = c0 + c4;
5209 							c8 = c8 + cC;
5210 							c0 = c0 + c8;
5211 							c0 *= 1.0f / 16.0f;
5212 
5213 							*(float*)(source0 + 4 * x) = c0;
5214 						}
5215 
5216 						source0 += pitch;
5217 						source1 += pitch;
5218 						source2 += pitch;
5219 						source3 += pitch;
5220 						source4 += pitch;
5221 						source5 += pitch;
5222 						source6 += pitch;
5223 						source7 += pitch;
5224 						source8 += pitch;
5225 						source9 += pitch;
5226 						sourceA += pitch;
5227 						sourceB += pitch;
5228 						sourceC += pitch;
5229 						sourceD += pitch;
5230 						sourceE += pitch;
5231 						sourceF += pitch;
5232 					}
5233 				}
5234 				else ASSERT(false);
5235 			}
5236 		}
5237 		else if(internal.format == FORMAT_G32R32F)
5238 		{
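			// Two 32-bit float channels per pixel; the SSE path requires an even width so that
			// each iteration covers two whole pixels (16 bytes).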
5239 			#if defined(__i386__) || defined(__x86_64__)
5240 				if(CPUID::supportsSSE() && (width % 2) == 0)
5241 				{
5242 					if(internal.samples == 2)
5243 					{
5244 						for(int y = 0; y < height; y++)
5245 						{
5246 							for(int x = 0; x < width; x += 2)
5247 							{
5248 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5249 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5250 
5251 								c0 = _mm_add_ps(c0, c1);
5252 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5253 
5254 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5255 							}
5256 
5257 							source0 += pitch;
5258 							source1 += pitch;
5259 						}
5260 					}
5261 					else if(internal.samples == 4)
5262 					{
5263 						for(int y = 0; y < height; y++)
5264 						{
5265 							for(int x = 0; x < width; x += 2)
5266 							{
5267 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5268 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5269 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5270 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5271 
5272 								c0 = _mm_add_ps(c0, c1);
5273 								c2 = _mm_add_ps(c2, c3);
5274 								c0 = _mm_add_ps(c0, c2);
5275 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5276 
5277 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5278 							}
5279 
5280 							source0 += pitch;
5281 							source1 += pitch;
5282 							source2 += pitch;
5283 							source3 += pitch;
5284 						}
5285 					}
5286 					else if(internal.samples == 8)
5287 					{
5288 						for(int y = 0; y < height; y++)
5289 						{
5290 							for(int x = 0; x < width; x += 2)
5291 							{
5292 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5293 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5294 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5295 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5296 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5297 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5298 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5299 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5300 
5301 								c0 = _mm_add_ps(c0, c1);
5302 								c2 = _mm_add_ps(c2, c3);
5303 								c4 = _mm_add_ps(c4, c5);
5304 								c6 = _mm_add_ps(c6, c7);
5305 								c0 = _mm_add_ps(c0, c2);
5306 								c4 = _mm_add_ps(c4, c6);
5307 								c0 = _mm_add_ps(c0, c4);
5308 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5309 
5310 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5311 							}
5312 
5313 							source0 += pitch;
5314 							source1 += pitch;
5315 							source2 += pitch;
5316 							source3 += pitch;
5317 							source4 += pitch;
5318 							source5 += pitch;
5319 							source6 += pitch;
5320 							source7 += pitch;
5321 						}
5322 					}
5323 					else if(internal.samples == 16)
5324 					{
5325 						for(int y = 0; y < height; y++)
5326 						{
5327 							for(int x = 0; x < width; x += 2)
5328 							{
5329 								__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
5330 								__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
5331 								__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
5332 								__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
5333 								__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
5334 								__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
5335 								__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
5336 								__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
5337 								__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
5338 								__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
5339 								__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
5340 								__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
5341 								__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
5342 								__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
5343 								__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
5344 								__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
5345 
5346 								c0 = _mm_add_ps(c0, c1);
5347 								c2 = _mm_add_ps(c2, c3);
5348 								c4 = _mm_add_ps(c4, c5);
5349 								c6 = _mm_add_ps(c6, c7);
5350 								c8 = _mm_add_ps(c8, c9);
5351 								cA = _mm_add_ps(cA, cB);
5352 								cC = _mm_add_ps(cC, cD);
5353 								cE = _mm_add_ps(cE, cF);
5354 								c0 = _mm_add_ps(c0, c2);
5355 								c4 = _mm_add_ps(c4, c6);
5356 								c8 = _mm_add_ps(c8, cA);
5357 								cC = _mm_add_ps(cC, cE);
5358 								c0 = _mm_add_ps(c0, c4);
5359 								c8 = _mm_add_ps(c8, cC);
5360 								c0 = _mm_add_ps(c0, c8);
5361 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5362 
5363 								_mm_store_ps((float*)(source0 + 8 * x), c0);
5364 							}
5365 
5366 							source0 += pitch;
5367 							source1 += pitch;
5368 							source2 += pitch;
5369 							source3 += pitch;
5370 							source4 += pitch;
5371 							source5 += pitch;
5372 							source6 += pitch;
5373 							source7 += pitch;
5374 							source8 += pitch;
5375 							source9 += pitch;
5376 							sourceA += pitch;
5377 							sourceB += pitch;
5378 							sourceC += pitch;
5379 							sourceD += pitch;
5380 							sourceE += pitch;
5381 							sourceF += pitch;
5382 						}
5383 					}
5384 					else ASSERT(false);
5385 				}
5386 				else
5387 			#endif
5388 			{
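				// Scalar fallback: treat each row as 2 * width independent float components.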
5389 				if(internal.samples == 2)
5390 				{
5391 					for(int y = 0; y < height; y++)
5392 					{
5393 						for(int x = 0; x < 2 * width; x++)
5394 						{
5395 							float c0 = *(float*)(source0 + 4 * x);
5396 							float c1 = *(float*)(source1 + 4 * x);
5397 
5398 							c0 = c0 + c1;
5399 							c0 *= 1.0f / 2.0f;
5400 
5401 							*(float*)(source0 + 4 * x) = c0;
5402 						}
5403 
5404 						source0 += pitch;
5405 						source1 += pitch;
5406 					}
5407 				}
5408 				else if(internal.samples == 4)
5409 				{
5410 					for(int y = 0; y < height; y++)
5411 					{
5412 						for(int x = 0; x < 2 * width; x++)
5413 						{
5414 							float c0 = *(float*)(source0 + 4 * x);
5415 							float c1 = *(float*)(source1 + 4 * x);
5416 							float c2 = *(float*)(source2 + 4 * x);
5417 							float c3 = *(float*)(source3 + 4 * x);
5418 
5419 							c0 = c0 + c1;
5420 							c2 = c2 + c3;
5421 							c0 = c0 + c2;
5422 							c0 *= 1.0f / 4.0f;
5423 
5424 							*(float*)(source0 + 4 * x) = c0;
5425 						}
5426 
5427 						source0 += pitch;
5428 						source1 += pitch;
5429 						source2 += pitch;
5430 						source3 += pitch;
5431 					}
5432 				}
5433 				else if(internal.samples == 8)
5434 				{
5435 					for(int y = 0; y < height; y++)
5436 					{
5437 						for(int x = 0; x < 2 * width; x++)
5438 						{
5439 							float c0 = *(float*)(source0 + 4 * x);
5440 							float c1 = *(float*)(source1 + 4 * x);
5441 							float c2 = *(float*)(source2 + 4 * x);
5442 							float c3 = *(float*)(source3 + 4 * x);
5443 							float c4 = *(float*)(source4 + 4 * x);
5444 							float c5 = *(float*)(source5 + 4 * x);
5445 							float c6 = *(float*)(source6 + 4 * x);
5446 							float c7 = *(float*)(source7 + 4 * x);
5447 
5448 							c0 = c0 + c1;
5449 							c2 = c2 + c3;
5450 							c4 = c4 + c5;
5451 							c6 = c6 + c7;
5452 							c0 = c0 + c2;
5453 							c4 = c4 + c6;
5454 							c0 = c0 + c4;
5455 							c0 *= 1.0f / 8.0f;
5456 
5457 							*(float*)(source0 + 4 * x) = c0;
5458 						}
5459 
5460 						source0 += pitch;
5461 						source1 += pitch;
5462 						source2 += pitch;
5463 						source3 += pitch;
5464 						source4 += pitch;
5465 						source5 += pitch;
5466 						source6 += pitch;
5467 						source7 += pitch;
5468 					}
5469 				}
5470 				else if(internal.samples == 16)
5471 				{
5472 					for(int y = 0; y < height; y++)
5473 					{
5474 						for(int x = 0; x < 2 * width; x++)
5475 						{
5476 							float c0 = *(float*)(source0 + 4 * x);
5477 							float c1 = *(float*)(source1 + 4 * x);
5478 							float c2 = *(float*)(source2 + 4 * x);
5479 							float c3 = *(float*)(source3 + 4 * x);
5480 							float c4 = *(float*)(source4 + 4 * x);
5481 							float c5 = *(float*)(source5 + 4 * x);
5482 							float c6 = *(float*)(source6 + 4 * x);
5483 							float c7 = *(float*)(source7 + 4 * x);
5484 							float c8 = *(float*)(source8 + 4 * x);
5485 							float c9 = *(float*)(source9 + 4 * x);
5486 							float cA = *(float*)(sourceA + 4 * x);
5487 							float cB = *(float*)(sourceB + 4 * x);
5488 							float cC = *(float*)(sourceC + 4 * x);
5489 							float cD = *(float*)(sourceD + 4 * x);
5490 							float cE = *(float*)(sourceE + 4 * x);
5491 							float cF = *(float*)(sourceF + 4 * x);
5492 
5493 							c0 = c0 + c1;
5494 							c2 = c2 + c3;
5495 							c4 = c4 + c5;
5496 							c6 = c6 + c7;
5497 							c8 = c8 + c9;
5498 							cA = cA + cB;
5499 							cC = cC + cD;
5500 							cE = cE + cF;
5501 							c0 = c0 + c2;
5502 							c4 = c4 + c6;
5503 							c8 = c8 + cA;
5504 							cC = cC + cE;
5505 							c0 = c0 + c4;
5506 							c8 = c8 + cC;
5507 							c0 = c0 + c8;
5508 							c0 *= 1.0f / 16.0f;
5509 
5510 							*(float*)(source0 + 4 * x) = c0;
5511 						}
5512 
5513 						source0 += pitch;
5514 						source1 += pitch;
5515 						source2 += pitch;
5516 						source3 += pitch;
5517 						source4 += pitch;
5518 						source5 += pitch;
5519 						source6 += pitch;
5520 						source7 += pitch;
5521 						source8 += pitch;
5522 						source9 += pitch;
5523 						sourceA += pitch;
5524 						sourceB += pitch;
5525 						sourceC += pitch;
5526 						sourceD += pitch;
5527 						sourceE += pitch;
5528 						sourceF += pitch;
5529 					}
5530 				}
5531 				else ASSERT(false);
5532 			}
5533 		}
5534 		else if(internal.format == FORMAT_A32B32G32R32F ||
5535 		        internal.format == FORMAT_X32B32G32R32F ||
5536 		        internal.format == FORMAT_X32B32G32R32F_UNSIGNED)
5537 		{
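			// Four 32-bit float channels per pixel; one pixel fills a whole SSE register,
			// so no width alignment is required.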
5538 			#if defined(__i386__) || defined(__x86_64__)
5539 				if(CPUID::supportsSSE())
5540 				{
5541 					if(internal.samples == 2)
5542 					{
5543 						for(int y = 0; y < height; y++)
5544 						{
5545 							for(int x = 0; x < width; x++)
5546 							{
5547 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5548 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5549 
5550 								c0 = _mm_add_ps(c0, c1);
5551 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5552 
5553 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5554 							}
5555 
5556 							source0 += pitch;
5557 							source1 += pitch;
5558 						}
5559 					}
5560 					else if(internal.samples == 4)
5561 					{
5562 						for(int y = 0; y < height; y++)
5563 						{
5564 							for(int x = 0; x < width; x++)
5565 							{
5566 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5567 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5568 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5569 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5570 
5571 								c0 = _mm_add_ps(c0, c1);
5572 								c2 = _mm_add_ps(c2, c3);
5573 								c0 = _mm_add_ps(c0, c2);
5574 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5575 
5576 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5577 							}
5578 
5579 							source0 += pitch;
5580 							source1 += pitch;
5581 							source2 += pitch;
5582 							source3 += pitch;
5583 						}
5584 					}
5585 					else if(internal.samples == 8)
5586 					{
5587 						for(int y = 0; y < height; y++)
5588 						{
5589 							for(int x = 0; x < width; x++)
5590 							{
5591 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5592 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5593 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5594 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5595 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5596 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5597 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5598 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5599 
5600 								c0 = _mm_add_ps(c0, c1);
5601 								c2 = _mm_add_ps(c2, c3);
5602 								c4 = _mm_add_ps(c4, c5);
5603 								c6 = _mm_add_ps(c6, c7);
5604 								c0 = _mm_add_ps(c0, c2);
5605 								c4 = _mm_add_ps(c4, c6);
5606 								c0 = _mm_add_ps(c0, c4);
5607 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5608 
5609 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5610 							}
5611 
5612 							source0 += pitch;
5613 							source1 += pitch;
5614 							source2 += pitch;
5615 							source3 += pitch;
5616 							source4 += pitch;
5617 							source5 += pitch;
5618 							source6 += pitch;
5619 							source7 += pitch;
5620 						}
5621 					}
5622 					else if(internal.samples == 16)
5623 					{
5624 						for(int y = 0; y < height; y++)
5625 						{
5626 							for(int x = 0; x < width; x++)
5627 							{
5628 								__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5629 								__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5630 								__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5631 								__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5632 								__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5633 								__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5634 								__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5635 								__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5636 								__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5637 								__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5638 								__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5639 								__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5640 								__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5641 								__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5642 								__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5643 								__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5644 
5645 								c0 = _mm_add_ps(c0, c1);
5646 								c2 = _mm_add_ps(c2, c3);
5647 								c4 = _mm_add_ps(c4, c5);
5648 								c6 = _mm_add_ps(c6, c7);
5649 								c8 = _mm_add_ps(c8, c9);
5650 								cA = _mm_add_ps(cA, cB);
5651 								cC = _mm_add_ps(cC, cD);
5652 								cE = _mm_add_ps(cE, cF);
5653 								c0 = _mm_add_ps(c0, c2);
5654 								c4 = _mm_add_ps(c4, c6);
5655 								c8 = _mm_add_ps(c8, cA);
5656 								cC = _mm_add_ps(cC, cE);
5657 								c0 = _mm_add_ps(c0, c4);
5658 								c8 = _mm_add_ps(c8, cC);
5659 								c0 = _mm_add_ps(c0, c8);
5660 								c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5661 
5662 								_mm_store_ps((float*)(source0 + 16 * x), c0);
5663 							}
5664 
5665 							source0 += pitch;
5666 							source1 += pitch;
5667 							source2 += pitch;
5668 							source3 += pitch;
5669 							source4 += pitch;
5670 							source5 += pitch;
5671 							source6 += pitch;
5672 							source7 += pitch;
5673 							source8 += pitch;
5674 							source9 += pitch;
5675 							sourceA += pitch;
5676 							sourceB += pitch;
5677 							sourceC += pitch;
5678 							sourceD += pitch;
5679 							sourceE += pitch;
5680 							sourceF += pitch;
5681 						}
5682 					}
5683 					else ASSERT(false);
5684 				}
5685 				else
5686 			#endif
5687 			{
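				// Scalar fallback: treat each row as 4 * width independent float components.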
5688 				if(internal.samples == 2)
5689 				{
5690 					for(int y = 0; y < height; y++)
5691 					{
5692 						for(int x = 0; x < 4 * width; x++)
5693 						{
5694 							float c0 = *(float*)(source0 + 4 * x);
5695 							float c1 = *(float*)(source1 + 4 * x);
5696 
5697 							c0 = c0 + c1;
5698 							c0 *= 1.0f / 2.0f;
5699 
5700 							*(float*)(source0 + 4 * x) = c0;
5701 						}
5702 
5703 						source0 += pitch;
5704 						source1 += pitch;
5705 					}
5706 				}
5707 				else if(internal.samples == 4)
5708 				{
5709 					for(int y = 0; y < height; y++)
5710 					{
5711 						for(int x = 0; x < 4 * width; x++)
5712 						{
5713 							float c0 = *(float*)(source0 + 4 * x);
5714 							float c1 = *(float*)(source1 + 4 * x);
5715 							float c2 = *(float*)(source2 + 4 * x);
5716 							float c3 = *(float*)(source3 + 4 * x);
5717 
5718 							c0 = c0 + c1;
5719 							c2 = c2 + c3;
5720 							c0 = c0 + c2;
5721 							c0 *= 1.0f / 4.0f;
5722 
5723 							*(float*)(source0 + 4 * x) = c0;
5724 						}
5725 
5726 						source0 += pitch;
5727 						source1 += pitch;
5728 						source2 += pitch;
5729 						source3 += pitch;
5730 					}
5731 				}
5732 				else if(internal.samples == 8)
5733 				{
5734 					for(int y = 0; y < height; y++)
5735 					{
5736 						for(int x = 0; x < 4 * width; x++)
5737 						{
5738 							float c0 = *(float*)(source0 + 4 * x);
5739 							float c1 = *(float*)(source1 + 4 * x);
5740 							float c2 = *(float*)(source2 + 4 * x);
5741 							float c3 = *(float*)(source3 + 4 * x);
5742 							float c4 = *(float*)(source4 + 4 * x);
5743 							float c5 = *(float*)(source5 + 4 * x);
5744 							float c6 = *(float*)(source6 + 4 * x);
5745 							float c7 = *(float*)(source7 + 4 * x);
5746 
5747 							c0 = c0 + c1;
5748 							c2 = c2 + c3;
5749 							c4 = c4 + c5;
5750 							c6 = c6 + c7;
5751 							c0 = c0 + c2;
5752 							c4 = c4 + c6;
5753 							c0 = c0 + c4;
5754 							c0 *= 1.0f / 8.0f;
5755 
5756 							*(float*)(source0 + 4 * x) = c0;
5757 						}
5758 
5759 						source0 += pitch;
5760 						source1 += pitch;
5761 						source2 += pitch;
5762 						source3 += pitch;
5763 						source4 += pitch;
5764 						source5 += pitch;
5765 						source6 += pitch;
5766 						source7 += pitch;
5767 					}
5768 				}
5769 				else if(internal.samples == 16)
5770 				{
5771 					for(int y = 0; y < height; y++)
5772 					{
5773 						for(int x = 0; x < 4 * width; x++)
5774 						{
5775 							float c0 = *(float*)(source0 + 4 * x);
5776 							float c1 = *(float*)(source1 + 4 * x);
5777 							float c2 = *(float*)(source2 + 4 * x);
5778 							float c3 = *(float*)(source3 + 4 * x);
5779 							float c4 = *(float*)(source4 + 4 * x);
5780 							float c5 = *(float*)(source5 + 4 * x);
5781 							float c6 = *(float*)(source6 + 4 * x);
5782 							float c7 = *(float*)(source7 + 4 * x);
5783 							float c8 = *(float*)(source8 + 4 * x);
5784 							float c9 = *(float*)(source9 + 4 * x);
5785 							float cA = *(float*)(sourceA + 4 * x);
5786 							float cB = *(float*)(sourceB + 4 * x);
5787 							float cC = *(float*)(sourceC + 4 * x);
5788 							float cD = *(float*)(sourceD + 4 * x);
5789 							float cE = *(float*)(sourceE + 4 * x);
5790 							float cF = *(float*)(sourceF + 4 * x);
5791 
5792 							c0 = c0 + c1;
5793 							c2 = c2 + c3;
5794 							c4 = c4 + c5;
5795 							c6 = c6 + c7;
5796 							c8 = c8 + c9;
5797 							cA = cA + cB;
5798 							cC = cC + cD;
5799 							cE = cE + cF;
5800 							c0 = c0 + c2;
5801 							c4 = c4 + c6;
5802 							c8 = c8 + cA;
5803 							cC = cC + cE;
5804 							c0 = c0 + c4;
5805 							c8 = c8 + cC;
5806 							c0 = c0 + c8;
5807 							c0 *= 1.0f / 16.0f;
5808 
5809 							*(float*)(source0 + 4 * x) = c0;
5810 						}
5811 
5812 						source0 += pitch;
5813 						source1 += pitch;
5814 						source2 += pitch;
5815 						source3 += pitch;
5816 						source4 += pitch;
5817 						source5 += pitch;
5818 						source6 += pitch;
5819 						source7 += pitch;
5820 						source8 += pitch;
5821 						source9 += pitch;
5822 						sourceA += pitch;
5823 						sourceB += pitch;
5824 						sourceC += pitch;
5825 						sourceD += pitch;
5826 						sourceE += pitch;
5827 						sourceF += pitch;
5828 					}
5829 				}
5830 				else ASSERT(false);
5831 			}
5832 		}
5833 		else if(internal.format == FORMAT_R5G6B5)
5834 		{
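			// 16-bit 5:6:5 pixels. After masking, red and blue each fit within a single byte, so they
			// can be averaged with byte averages; green straddles the byte boundary and needs 16-bit averages.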
5835 			#if defined(__i386__) || defined(__x86_64__)
5836 				if(CPUID::supportsSSE2() && (width % 8) == 0)
5837 				{
5838 					if(internal.samples == 2)
5839 					{
5840 						for(int y = 0; y < height; y++)
5841 						{
5842 							for(int x = 0; x < width; x += 8)
5843 							{
5844 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5845 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5846 
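								// 0xF81F selects the red and blue fields; 0x07E0 selects the green field.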
5847 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5848 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5849 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5850 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5851 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5852 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5853 
5854 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5855 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5856 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5857 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5858 								c0 = _mm_or_si128(c0, c1);
5859 
5860 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5861 							}
5862 
5863 							source0 += pitch;
5864 							source1 += pitch;
5865 						}
5866 					}
5867 					else if(internal.samples == 4)
5868 					{
5869 						for(int y = 0; y < height; y++)
5870 						{
5871 							for(int x = 0; x < width; x += 8)
5872 							{
5873 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5874 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5875 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5876 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5877 
5878 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5879 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5880 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5881 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5882 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5883 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5884 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5885 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5886 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5887 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5888 
5889 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5890 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5891 								c0 = _mm_avg_epu8(c0, c2);
5892 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5893 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5894 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5895 								c1 = _mm_avg_epu16(c1, c3);
5896 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5897 								c0 = _mm_or_si128(c0, c1);
5898 
5899 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5900 							}
5901 
5902 							source0 += pitch;
5903 							source1 += pitch;
5904 							source2 += pitch;
5905 							source3 += pitch;
5906 						}
5907 					}
5908 					else if(internal.samples == 8)
5909 					{
5910 						for(int y = 0; y < height; y++)
5911 						{
5912 							for(int x = 0; x < width; x += 8)
5913 							{
5914 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5915 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5916 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5917 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5918 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5919 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5920 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5921 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5922 
5923 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5924 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5925 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5926 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5927 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5928 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5929 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5930 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5931 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5932 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5933 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5934 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5935 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5936 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5937 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5938 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5939 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5940 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5941 
5942 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5943 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5944 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5945 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5946 								c0 = _mm_avg_epu8(c0, c2);
5947 								c4 = _mm_avg_epu8(c4, c6);
5948 								c0 = _mm_avg_epu8(c0, c4);
5949 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5950 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
5951 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
5952 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
5953 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
5954 								c1 = _mm_avg_epu16(c1, c3);
5955 								c5 = _mm_avg_epu16(c5, c7);
5956 								c1 = _mm_avg_epu16(c1, c5);
5957 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5958 								c0 = _mm_or_si128(c0, c1);
5959 
5960 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5961 							}
5962 
5963 							source0 += pitch;
5964 							source1 += pitch;
5965 							source2 += pitch;
5966 							source3 += pitch;
5967 							source4 += pitch;
5968 							source5 += pitch;
5969 							source6 += pitch;
5970 							source7 += pitch;
5971 						}
5972 					}
5973 					else if(internal.samples == 16)
5974 					{
5975 						for(int y = 0; y < height; y++)
5976 						{
5977 							for(int x = 0; x < width; x += 8)
5978 							{
5979 								__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5980 								__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5981 								__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5982 								__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5983 								__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5984 								__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5985 								__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5986 								__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5987 								__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5988 								__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5989 								__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5990 								__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5991 								__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5992 								__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5993 								__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5994 								__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5995 
5996 								static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5997 								static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5998 								__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5999 								__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
6000 								__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
6001 								__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6002 								__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
6003 								__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
6004 								__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
6005 								__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
6006 								__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
6007 								__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
6008 								__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
6009 								__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
6010 								__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
6011 								__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
6012 								__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
6013 								__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
6014 								__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
6015 								__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
6016 								__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
6017 								__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
6018 								__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
6019 								__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
6020 								__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
6021 								__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
6022 								__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
6023 								__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
6024 								__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
6025 								__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
6026 								__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
6027 								__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
6028 								__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
6029 								__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
6030 
6031 								c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
6032 								c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
6033 								c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
6034 								c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
6035 								c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
6036 								cA = _mm_avg_epu8(cA_r_b, cB_r_b);
6037 								cC = _mm_avg_epu8(cC_r_b, cD_r_b);
6038 								cE = _mm_avg_epu8(cE_r_b, cF_r_b);
6039 								c0 = _mm_avg_epu8(c0, c2);
6040 								c4 = _mm_avg_epu8(c4, c6);
6041 								c8 = _mm_avg_epu8(c8, cA);
6042 								cC = _mm_avg_epu8(cC, cE);
6043 								c0 = _mm_avg_epu8(c0, c4);
6044 								c8 = _mm_avg_epu8(c8, cC);
6045 								c0 = _mm_avg_epu8(c0, c8);
6046 								c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
6047 								c1 = _mm_avg_epu16(c0__g_, c1__g_);
6048 								c3 = _mm_avg_epu16(c2__g_, c3__g_);
6049 								c5 = _mm_avg_epu16(c4__g_, c5__g_);
6050 								c7 = _mm_avg_epu16(c6__g_, c7__g_);
6051 								c9 = _mm_avg_epu16(c8__g_, c9__g_);
6052 								cB = _mm_avg_epu16(cA__g_, cB__g_);
6053 								cD = _mm_avg_epu16(cC__g_, cD__g_);
6054 								cF = _mm_avg_epu16(cE__g_, cF__g_);
6055 								c1 = _mm_avg_epu16(c1, c3);
6056 								c5 = _mm_avg_epu16(c5, c7);
6057 								c9 = _mm_avg_epu16(c9, cB);
6058 								cD = _mm_avg_epu16(cD, cF);
6059 								c1 = _mm_avg_epu16(c1, c5);
6060 								c9 = _mm_avg_epu16(c9, cD);
6061 								c1 = _mm_avg_epu16(c1, c9);
6062 								c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
6063 								c0 = _mm_or_si128(c0, c1);
6064 
6065 								_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
6066 							}
6067 
6068 							source0 += pitch;
6069 							source1 += pitch;
6070 							source2 += pitch;
6071 							source3 += pitch;
6072 							source4 += pitch;
6073 							source5 += pitch;
6074 							source6 += pitch;
6075 							source7 += pitch;
6076 							source8 += pitch;
6077 							source9 += pitch;
6078 							sourceA += pitch;
6079 							sourceB += pitch;
6080 							sourceC += pitch;
6081 							sourceD += pitch;
6082 							sourceE += pitch;
6083 							sourceF += pitch;
6084 						}
6085 					}
6086 					else ASSERT(false);
6087 				}
6088 				else
6089 			#endif
6090 			{
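				// Scalar fallback: AVERAGE computes a rounded per-field average of 5:6:5 pixels.
				// The 0x7BEF mask clears bits shifted across field boundaries and 0x0821 adds the
				// rounding bit to each field when the operands' low bits differ.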
6091 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
6092 
6093 				if(internal.samples == 2)
6094 				{
6095 					for(int y = 0; y < height; y++)
6096 					{
6097 						for(int x = 0; x < width; x++)
6098 						{
6099 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6100 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6101 
6102 							c0 = AVERAGE(c0, c1);
6103 
6104 							*(unsigned short*)(source0 + 2 * x) = c0;
6105 						}
6106 
6107 						source0 += pitch;
6108 						source1 += pitch;
6109 					}
6110 				}
6111 				else if(internal.samples == 4)
6112 				{
6113 					for(int y = 0; y < height; y++)
6114 					{
6115 						for(int x = 0; x < width; x++)
6116 						{
6117 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6118 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6119 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6120 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6121 
6122 							c0 = AVERAGE(c0, c1);
6123 							c2 = AVERAGE(c2, c3);
6124 							c0 = AVERAGE(c0, c2);
6125 
6126 							*(unsigned short*)(source0 + 2 * x) = c0;
6127 						}
6128 
6129 						source0 += pitch;
6130 						source1 += pitch;
6131 						source2 += pitch;
6132 						source3 += pitch;
6133 					}
6134 				}
6135 				else if(internal.samples == 8)
6136 				{
6137 					for(int y = 0; y < height; y++)
6138 					{
6139 						for(int x = 0; x < width; x++)
6140 						{
6141 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6142 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6143 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6144 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6145 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6146 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6147 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6148 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6149 
6150 							c0 = AVERAGE(c0, c1);
6151 							c2 = AVERAGE(c2, c3);
6152 							c4 = AVERAGE(c4, c5);
6153 							c6 = AVERAGE(c6, c7);
6154 							c0 = AVERAGE(c0, c2);
6155 							c4 = AVERAGE(c4, c6);
6156 							c0 = AVERAGE(c0, c4);
6157 
6158 							*(unsigned short*)(source0 + 2 * x) = c0;
6159 						}
6160 
6161 						source0 += pitch;
6162 						source1 += pitch;
6163 						source2 += pitch;
6164 						source3 += pitch;
6165 						source4 += pitch;
6166 						source5 += pitch;
6167 						source6 += pitch;
6168 						source7 += pitch;
6169 					}
6170 				}
6171 				else if(internal.samples == 16)
6172 				{
6173 					for(int y = 0; y < height; y++)
6174 					{
6175 						for(int x = 0; x < width; x++)
6176 						{
6177 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
6178 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
6179 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
6180 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
6181 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
6182 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
6183 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
6184 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
6185 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
6186 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
6187 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
6188 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
6189 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
6190 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
6191 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
6192 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
6193 
6194 							c0 = AVERAGE(c0, c1);
6195 							c2 = AVERAGE(c2, c3);
6196 							c4 = AVERAGE(c4, c5);
6197 							c6 = AVERAGE(c6, c7);
6198 							c8 = AVERAGE(c8, c9);
6199 							cA = AVERAGE(cA, cB);
6200 							cC = AVERAGE(cC, cD);
6201 							cE = AVERAGE(cE, cF);
6202 							c0 = AVERAGE(c0, c2);
6203 							c4 = AVERAGE(c4, c6);
6204 							c8 = AVERAGE(c8, cA);
6205 							cC = AVERAGE(cC, cE);
6206 							c0 = AVERAGE(c0, c4);
6207 							c8 = AVERAGE(c8, cC);
6208 							c0 = AVERAGE(c0, c8);
6209 
6210 							*(unsigned short*)(source0 + 2 * x) = c0;
6211 						}
6212 
6213 						source0 += pitch;
6214 						source1 += pitch;
6215 						source2 += pitch;
6216 						source3 += pitch;
6217 						source4 += pitch;
6218 						source5 += pitch;
6219 						source6 += pitch;
6220 						source7 += pitch;
6221 						source8 += pitch;
6222 						source9 += pitch;
6223 						sourceA += pitch;
6224 						sourceB += pitch;
6225 						sourceC += pitch;
6226 						sourceD += pitch;
6227 						sourceE += pitch;
6228 						sourceF += pitch;
6229 					}
6230 				}
6231 				else ASSERT(false);
6232 
6233 				#undef AVERAGE
6234 			}
6235 		}
6236 		else
6237 		{
6238 		//	UNIMPLEMENTED();
6239 		}
6240 	}
6241 }
6242