1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Surface.hpp"
16 
17 #include "Color.hpp"
18 #include "Context.hpp"
19 #include "ETC_Decoder.hpp"
20 #include "Renderer.hpp"
21 #include "Common/Half.hpp"
22 #include "Common/Memory.hpp"
23 #include "Common/CPUID.hpp"
24 #include "Common/Resource.hpp"
25 #include "Common/Debug.hpp"
26 #include "Reactor/Reactor.hpp"
27 
28 #include <xmmintrin.h>
29 #include <emmintrin.h>
30 
31 #undef min
32 #undef max
33 
34 namespace sw
35 {
	// Settings that are only declared here; their definitions live outside
	// this chunk of the file.
	extern bool quadLayoutEnabled;
	extern bool complementaryDepthBuffer;
	extern TranscendentalPrecision logPrecision;

	// Palette table shared by all surfaces. Buffer::read() indexes it with the
	// stored byte when decoding FORMAT_P8 and FORMAT_A8P8 texels, and asserts
	// that it is non-null first. It starts out null.
	unsigned int *Surface::palette = 0;
	// NOTE(review): presumably an identifier/version for the installed palette;
	// it is not used in the visible part of the file - confirm against callers.
	unsigned int Surface::paletteID = 0;
42 
clip(int minX,int minY,int maxX,int maxY)43 	void Rect::clip(int minX, int minY, int maxX, int maxY)
44 	{
45 		x0 = clamp(x0, minX, maxX);
46 		y0 = clamp(y0, minY, maxY);
47 		x1 = clamp(x1, minX, maxX);
48 		y1 = clamp(y1, minY, maxY);
49 	}
50 
write(int x,int y,int z,const Color<float> & color)51 	void Surface::Buffer::write(int x, int y, int z, const Color<float> &color)
52 	{
53 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
54 
55 		write(element, color);
56 	}
57 
write(int x,int y,const Color<float> & color)58 	void Surface::Buffer::write(int x, int y, const Color<float> &color)
59 	{
60 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
61 
62 		write(element, color);
63 	}
64 
write(void * element,const Color<float> & color)65 	inline void Surface::Buffer::write(void *element, const Color<float> &color)
66 	{
67 		switch(format)
68 		{
69 		case FORMAT_A8:
70 			*(unsigned char*)element = unorm<8>(color.a);
71 			break;
72 		case FORMAT_R8I_SNORM:
73 			*(char*)element = snorm<8>(color.r);
74 			break;
75 		case FORMAT_R8:
76 			*(unsigned char*)element = unorm<8>(color.r);
77 			break;
78 		case FORMAT_R8I:
79 			*(char*)element = scast<8>(color.r);
80 			break;
81 		case FORMAT_R8UI:
82 			*(unsigned char*)element = ucast<8>(color.r);
83 			break;
84 		case FORMAT_R16I:
85 			*(short*)element = scast<16>(color.r);
86 			break;
87 		case FORMAT_R16UI:
88 			*(unsigned short*)element = ucast<16>(color.r);
89 			break;
90 		case FORMAT_R32I:
91 			*(int*)element = static_cast<int>(color.r);
92 			break;
93 		case FORMAT_R32UI:
94 			*(unsigned int*)element = static_cast<unsigned int>(color.r);
95 			break;
96 		case FORMAT_R3G3B2:
97 			*(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
98 			break;
99 		case FORMAT_A8R3G3B2:
100 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0);
101 			break;
102 		case FORMAT_X4R4G4B4:
103 			*(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
104 			break;
105 		case FORMAT_A4R4G4B4:
106 			*(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0);
107 			break;
108 		case FORMAT_R4G4B4A4:
109 			*(unsigned short*)element = (unorm<4>(color.r) << 12) | (unorm<4>(color.g) << 8) | (unorm<4>(color.b) << 4) | (unorm<4>(color.a) << 0);
110 			break;
111 		case FORMAT_R5G6B5:
112 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0);
113 			break;
114 		case FORMAT_A1R5G5B5:
115 			*(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
116 			break;
117 		case FORMAT_R5G5B5A1:
118 			*(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<5>(color.g) << 6) | (unorm<5>(color.b) << 1) | (unorm<5>(color.a) << 0);
119 			break;
120 		case FORMAT_X1R5G5B5:
121 			*(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0);
122 			break;
123 		case FORMAT_A8R8G8B8:
124 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
125 			break;
126 		case FORMAT_X8R8G8B8:
127 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0);
128 			break;
129 		case FORMAT_A8B8G8R8I_SNORM:
130 			*(unsigned int*)element = (static_cast<unsigned int>(snorm<8>(color.a)) << 24) |
131 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
132 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
133 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
134 			break;
135 		case FORMAT_A8B8G8R8:
136 		case FORMAT_SRGB8_A8:
137 			*(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
138 			break;
139 		case FORMAT_A8B8G8R8I:
140 			*(unsigned int*)element = (static_cast<unsigned int>(scast<8>(color.a)) << 24) |
141 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
142 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
143 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
144 			break;
145 		case FORMAT_A8B8G8R8UI:
146 			*(unsigned int*)element = (ucast<8>(color.a) << 24) | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
147 			break;
148 		case FORMAT_X8B8G8R8I_SNORM:
149 			*(unsigned int*)element = 0x7F000000 |
150 			                          (static_cast<unsigned int>(snorm<8>(color.b)) << 16) |
151 			                          (static_cast<unsigned int>(snorm<8>(color.g)) << 8) |
152 			                          (static_cast<unsigned int>(snorm<8>(color.r)) << 0);
153 			break;
154 		case FORMAT_X8B8G8R8:
155 		case FORMAT_SRGB8_X8:
156 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
157 			break;
158 		case FORMAT_X8B8G8R8I:
159 			*(unsigned int*)element = 0x7F000000 |
160 			                          (static_cast<unsigned int>(scast<8>(color.b)) << 16) |
161 			                          (static_cast<unsigned int>(scast<8>(color.g)) << 8) |
162 			                          (static_cast<unsigned int>(scast<8>(color.r)) << 0);
163 		case FORMAT_X8B8G8R8UI:
164 			*(unsigned int*)element = 0xFF000000 | (ucast<8>(color.b) << 16) | (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
165 			break;
166 		case FORMAT_A2R10G10B10:
167 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0);
168 			break;
169 		case FORMAT_A2B10G10R10:
170 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0);
171 			break;
172 		case FORMAT_G8R8I_SNORM:
173 			*(unsigned short*)element = (static_cast<unsigned short>(snorm<8>(color.g)) << 8) |
174 			                            (static_cast<unsigned short>(snorm<8>(color.r)) << 0);
175 			break;
176 		case FORMAT_G8R8:
177 			*(unsigned short*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0);
178 			break;
179 		case FORMAT_G8R8I:
180 			*(unsigned short*)element = (static_cast<unsigned short>(scast<8>(color.g)) << 8) |
181 			                            (static_cast<unsigned short>(scast<8>(color.r)) << 0);
182 			break;
183 		case FORMAT_G8R8UI:
184 			*(unsigned short*)element = (ucast<8>(color.g) << 8) | (ucast<8>(color.r) << 0);
185 			break;
186 		case FORMAT_G16R16:
187 			*(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0);
188 			break;
189 		case FORMAT_G16R16I:
190 			*(unsigned int*)element = (static_cast<unsigned int>(scast<16>(color.g)) << 16) |
191 			                          (static_cast<unsigned int>(scast<16>(color.r)) << 0);
192 			break;
193 		case FORMAT_G16R16UI:
194 			*(unsigned int*)element = (ucast<16>(color.g) << 16) | (ucast<16>(color.r) << 0);
195 			break;
196 		case FORMAT_G32R32I:
197 		case FORMAT_G32R32UI:
198 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
199 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
200 			break;
201 		case FORMAT_A16B16G16R16:
202 			((unsigned short*)element)[0] = unorm<16>(color.r);
203 			((unsigned short*)element)[1] = unorm<16>(color.g);
204 			((unsigned short*)element)[2] = unorm<16>(color.b);
205 			((unsigned short*)element)[3] = unorm<16>(color.a);
206 			break;
207 		case FORMAT_A16B16G16R16I:
208 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
209 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
210 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
211 			((unsigned short*)element)[3] = static_cast<unsigned short>(scast<16>(color.a));
212 			break;
213 		case FORMAT_A16B16G16R16UI:
214 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
215 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
216 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
217 			((unsigned short*)element)[3] = static_cast<unsigned short>(ucast<16>(color.a));
218 			break;
219 		case FORMAT_X16B16G16R16I:
220 			((unsigned short*)element)[0] = static_cast<unsigned short>(scast<16>(color.r));
221 			((unsigned short*)element)[1] = static_cast<unsigned short>(scast<16>(color.g));
222 			((unsigned short*)element)[2] = static_cast<unsigned short>(scast<16>(color.b));
223 			break;
224 		case FORMAT_X16B16G16R16UI:
225 			((unsigned short*)element)[0] = static_cast<unsigned short>(ucast<16>(color.r));
226 			((unsigned short*)element)[1] = static_cast<unsigned short>(ucast<16>(color.g));
227 			((unsigned short*)element)[2] = static_cast<unsigned short>(ucast<16>(color.b));
228 			break;
229 		case FORMAT_A32B32G32R32I:
230 		case FORMAT_A32B32G32R32UI:
231 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
232 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
233 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
234 			((unsigned int*)element)[3] = static_cast<unsigned int>(color.a);
235 			break;
236 		case FORMAT_X32B32G32R32I:
237 		case FORMAT_X32B32G32R32UI:
238 			((unsigned int*)element)[0] = static_cast<unsigned int>(color.r);
239 			((unsigned int*)element)[1] = static_cast<unsigned int>(color.g);
240 			((unsigned int*)element)[2] = static_cast<unsigned int>(color.b);
241 			break;
242 		case FORMAT_V8U8:
243 			*(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
244 			break;
245 		case FORMAT_L6V5U5:
246 			*(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0);
247 			break;
248 		case FORMAT_Q8W8V8U8:
249 			*(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
250 			break;
251 		case FORMAT_X8L8V8U8:
252 			*(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0);
253 			break;
254 		case FORMAT_V16U16:
255 			*(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0);
256 			break;
257 		case FORMAT_A2W10V10U10:
258 			*(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0);
259 			break;
260 		case FORMAT_A16W16V16U16:
261 			((unsigned short*)element)[0] = snorm<16>(color.r);
262 			((unsigned short*)element)[1] = snorm<16>(color.g);
263 			((unsigned short*)element)[2] = snorm<16>(color.b);
264 			((unsigned short*)element)[3] = unorm<16>(color.a);
265 			break;
266 		case FORMAT_Q16W16V16U16:
267 			((unsigned short*)element)[0] = snorm<16>(color.r);
268 			((unsigned short*)element)[1] = snorm<16>(color.g);
269 			((unsigned short*)element)[2] = snorm<16>(color.b);
270 			((unsigned short*)element)[3] = snorm<16>(color.a);
271 			break;
272 		case FORMAT_R8G8B8:
273 			((unsigned char*)element)[0] = unorm<8>(color.b);
274 			((unsigned char*)element)[1] = unorm<8>(color.g);
275 			((unsigned char*)element)[2] = unorm<8>(color.r);
276 			break;
277 		case FORMAT_B8G8R8:
278 			((unsigned char*)element)[0] = unorm<8>(color.r);
279 			((unsigned char*)element)[1] = unorm<8>(color.g);
280 			((unsigned char*)element)[2] = unorm<8>(color.b);
281 			break;
282 		case FORMAT_R16F:
283 			*(half*)element = (half)color.r;
284 			break;
285 		case FORMAT_A16F:
286 			*(half*)element = (half)color.a;
287 			break;
288 		case FORMAT_G16R16F:
289 			((half*)element)[0] = (half)color.r;
290 			((half*)element)[1] = (half)color.g;
291 			break;
292 		case FORMAT_B16G16R16F:
293 			((half*)element)[0] = (half)color.r;
294 			((half*)element)[1] = (half)color.g;
295 			((half*)element)[2] = (half)color.b;
296 			break;
297 		case FORMAT_A16B16G16R16F:
298 			((half*)element)[0] = (half)color.r;
299 			((half*)element)[1] = (half)color.g;
300 			((half*)element)[2] = (half)color.b;
301 			((half*)element)[3] = (half)color.a;
302 			break;
303 		case FORMAT_A32F:
304 			*(float*)element = color.a;
305 			break;
306 		case FORMAT_R32F:
307 			*(float*)element = color.r;
308 			break;
309 		case FORMAT_G32R32F:
310 			((float*)element)[0] = color.r;
311 			((float*)element)[1] = color.g;
312 			break;
313 		case FORMAT_X32B32G32R32F:
314 			((float*)element)[3] = 1.0f;
315 		case FORMAT_B32G32R32F:
316 			((float*)element)[0] = color.r;
317 			((float*)element)[1] = color.g;
318 			((float*)element)[2] = color.b;
319 			break;
320 		case FORMAT_A32B32G32R32F:
321 			((float*)element)[0] = color.r;
322 			((float*)element)[1] = color.g;
323 			((float*)element)[2] = color.b;
324 			((float*)element)[3] = color.a;
325 			break;
326 		case FORMAT_D32F:
327 		case FORMAT_D32F_LOCKABLE:
328 		case FORMAT_D32FS8_TEXTURE:
329 		case FORMAT_D32FS8_SHADOW:
330 			*((float*)element) = color.r;
331 			break;
332 		case FORMAT_D32F_COMPLEMENTARY:
333 			*((float*)element) = 1 - color.r;
334 			break;
335 		case FORMAT_S8:
336 			*((unsigned char*)element) = unorm<8>(color.r);
337 			break;
338 		case FORMAT_L8:
339 			*(unsigned char*)element = unorm<8>(color.r);
340 			break;
341 		case FORMAT_A4L4:
342 			*(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0);
343 			break;
344 		case FORMAT_L16:
345 			*(unsigned short*)element = unorm<16>(color.r);
346 			break;
347 		case FORMAT_A8L8:
348 			*(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0);
349 			break;
350 		case FORMAT_L16F:
351 			*(half*)element = (half)color.r;
352 			break;
353 		case FORMAT_A16L16F:
354 			((half*)element)[0] = (half)color.r;
355 			((half*)element)[1] = (half)color.a;
356 			break;
357 		case FORMAT_L32F:
358 			*(float*)element = color.r;
359 			break;
360 		case FORMAT_A32L32F:
361 			((float*)element)[0] = color.r;
362 			((float*)element)[1] = color.a;
363 			break;
364 		default:
365 			ASSERT(false);
366 		}
367 	}
368 
read(int x,int y,int z) const369 	Color<float> Surface::Buffer::read(int x, int y, int z) const
370 	{
371 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
372 
373 		return read(element);
374 	}
375 
read(int x,int y) const376 	Color<float> Surface::Buffer::read(int x, int y) const
377 	{
378 		void *element = (unsigned char*)buffer + x * bytes + y * pitchB;
379 
380 		return read(element);
381 	}
382 
read(void * element) const383 	inline Color<float> Surface::Buffer::read(void *element) const
384 	{
385 		float r = 0.0f;
386 		float g = 0.0f;
387 		float b = 0.0f;
388 		float a = 1.0f;
389 
390 		switch(format)
391 		{
392 		case FORMAT_P8:
393 			{
394 				ASSERT(palette);
395 
396 				unsigned int abgr = palette[*(unsigned char*)element];
397 
398 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
399 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
400 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
401 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
402 			}
403 			break;
404 		case FORMAT_A8P8:
405 			{
406 				ASSERT(palette);
407 
408 				unsigned int bgr = palette[((unsigned char*)element)[0]];
409 
410 				r = (bgr & 0x000000FF) * (1.0f / 0x000000FF);
411 				g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00);
412 				b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000);
413 				a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
414 			}
415 			break;
416 		case FORMAT_A8:
417 			r = 0;
418 			g = 0;
419 			b = 0;
420 			a = *(unsigned char*)element * (1.0f / 0xFF);
421 			break;
422 		case FORMAT_R8I_SNORM:
423 			r = max((*(signed char*)element) * (1.0f / 0x7F), -1.0f);
424 			break;
425 		case FORMAT_R8:
426 			r = *(unsigned char*)element * (1.0f / 0xFF);
427 			break;
428 		case FORMAT_R8I:
429 			r = *(signed char*)element;
430 			break;
431 		case FORMAT_R8UI:
432 			r = *(unsigned char*)element;
433 			break;
434 		case FORMAT_R3G3B2:
435 			{
436 				unsigned char rgb = *(unsigned char*)element;
437 
438 				r = (rgb & 0xE0) * (1.0f / 0xE0);
439 				g = (rgb & 0x1C) * (1.0f / 0x1C);
440 				b = (rgb & 0x03) * (1.0f / 0x03);
441 			}
442 			break;
443 		case FORMAT_A8R3G3B2:
444 			{
445 				unsigned short argb = *(unsigned short*)element;
446 
447 				a = (argb & 0xFF00) * (1.0f / 0xFF00);
448 				r = (argb & 0x00E0) * (1.0f / 0x00E0);
449 				g = (argb & 0x001C) * (1.0f / 0x001C);
450 				b = (argb & 0x0003) * (1.0f / 0x0003);
451 			}
452 			break;
453 		case FORMAT_X4R4G4B4:
454 			{
455 				unsigned short rgb = *(unsigned short*)element;
456 
457 				r = (rgb & 0x0F00) * (1.0f / 0x0F00);
458 				g = (rgb & 0x00F0) * (1.0f / 0x00F0);
459 				b = (rgb & 0x000F) * (1.0f / 0x000F);
460 			}
461 			break;
462 		case FORMAT_A4R4G4B4:
463 			{
464 				unsigned short argb = *(unsigned short*)element;
465 
466 				a = (argb & 0xF000) * (1.0f / 0xF000);
467 				r = (argb & 0x0F00) * (1.0f / 0x0F00);
468 				g = (argb & 0x00F0) * (1.0f / 0x00F0);
469 				b = (argb & 0x000F) * (1.0f / 0x000F);
470 			}
471 			break;
472 		case FORMAT_R4G4B4A4:
473 			{
474 				unsigned short rgba = *(unsigned short*)element;
475 
476 				r = (rgba & 0xF000) * (1.0f / 0xF000);
477 				g = (rgba & 0x0F00) * (1.0f / 0x0F00);
478 				b = (rgba & 0x00F0) * (1.0f / 0x00F0);
479 				a = (rgba & 0x000F) * (1.0f / 0x000F);
480 			}
481 			break;
482 		case FORMAT_R5G6B5:
483 			{
484 				unsigned short rgb = *(unsigned short*)element;
485 
486 				r = (rgb & 0xF800) * (1.0f / 0xF800);
487 				g = (rgb & 0x07E0) * (1.0f / 0x07E0);
488 				b = (rgb & 0x001F) * (1.0f / 0x001F);
489 			}
490 			break;
491 		case FORMAT_A1R5G5B5:
492 			{
493 				unsigned short argb = *(unsigned short*)element;
494 
495 				a = (argb & 0x8000) * (1.0f / 0x8000);
496 				r = (argb & 0x7C00) * (1.0f / 0x7C00);
497 				g = (argb & 0x03E0) * (1.0f / 0x03E0);
498 				b = (argb & 0x001F) * (1.0f / 0x001F);
499 			}
500 			break;
501 		case FORMAT_R5G5B5A1:
502 			{
503 				unsigned short rgba = *(unsigned short*)element;
504 
505 				r = (rgba & 0xF800) * (1.0f / 0xF800);
506 				g = (rgba & 0x07C0) * (1.0f / 0x07C0);
507 				b = (rgba & 0x003E) * (1.0f / 0x003E);
508 				a = (rgba & 0x0001) * (1.0f / 0x0001);
509 			}
510 			break;
511 		case FORMAT_X1R5G5B5:
512 			{
513 				unsigned short xrgb = *(unsigned short*)element;
514 
515 				r = (xrgb & 0x7C00) * (1.0f / 0x7C00);
516 				g = (xrgb & 0x03E0) * (1.0f / 0x03E0);
517 				b = (xrgb & 0x001F) * (1.0f / 0x001F);
518 			}
519 			break;
520 		case FORMAT_A8R8G8B8:
521 			{
522 				unsigned int argb = *(unsigned int*)element;
523 
524 				a = (argb & 0xFF000000) * (1.0f / 0xFF000000);
525 				r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000);
526 				g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00);
527 				b = (argb & 0x000000FF) * (1.0f / 0x000000FF);
528 			}
529 			break;
530 		case FORMAT_X8R8G8B8:
531 			{
532 				unsigned int xrgb = *(unsigned int*)element;
533 
534 				r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000);
535 				g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00);
536 				b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF);
537 			}
538 			break;
539 		case FORMAT_A8B8G8R8I_SNORM:
540 			{
541 				signed char* abgr = (signed char*)element;
542 
543 				r = max(abgr[0] * (1.0f / 0x7F), -1.0f);
544 				g = max(abgr[1] * (1.0f / 0x7F), -1.0f);
545 				b = max(abgr[2] * (1.0f / 0x7F), -1.0f);
546 				a = max(abgr[3] * (1.0f / 0x7F), -1.0f);
547 			}
548 			break;
549 		case FORMAT_A8B8G8R8:
550 		case FORMAT_SRGB8_A8:
551 			{
552 				unsigned int abgr = *(unsigned int*)element;
553 
554 				a = (abgr & 0xFF000000) * (1.0f / 0xFF000000);
555 				b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000);
556 				g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00);
557 				r = (abgr & 0x000000FF) * (1.0f / 0x000000FF);
558 			}
559 			break;
560 		case FORMAT_A8B8G8R8I:
561 			{
562 				signed char* abgr = (signed char*)element;
563 
564 				r = abgr[0];
565 				g = abgr[1];
566 				b = abgr[2];
567 				a = abgr[3];
568 			}
569 			break;
570 		case FORMAT_A8B8G8R8UI:
571 			{
572 				unsigned char* abgr = (unsigned char*)element;
573 
574 				r = abgr[0];
575 				g = abgr[1];
576 				b = abgr[2];
577 				a = abgr[3];
578 			}
579 			break;
580 		case FORMAT_X8B8G8R8I_SNORM:
581 			{
582 				signed char* bgr = (signed char*)element;
583 
584 				r = max(bgr[0] * (1.0f / 0x7F), -1.0f);
585 				g = max(bgr[1] * (1.0f / 0x7F), -1.0f);
586 				b = max(bgr[2] * (1.0f / 0x7F), -1.0f);
587 			}
588 			break;
589 		case FORMAT_X8B8G8R8:
590 		case FORMAT_SRGB8_X8:
591 			{
592 				unsigned int xbgr = *(unsigned int*)element;
593 
594 				b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000);
595 				g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00);
596 				r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF);
597 			}
598 			break;
599 		case FORMAT_X8B8G8R8I:
600 			{
601 				signed char* bgr = (signed char*)element;
602 
603 				r = bgr[0];
604 				g = bgr[1];
605 				b = bgr[2];
606 			}
607 			break;
608 		case FORMAT_X8B8G8R8UI:
609 			{
610 				unsigned char* bgr = (unsigned char*)element;
611 
612 				r = bgr[0];
613 				g = bgr[1];
614 				b = bgr[2];
615 			}
616 			break;
617 		case FORMAT_G8R8I_SNORM:
618 			{
619 				signed char* gr = (signed char*)element;
620 
621 				r = (gr[0] & 0xFF00) * (1.0f / 0xFF00);
622 				g = (gr[1] & 0x00FF) * (1.0f / 0x00FF);
623 			}
624 			break;
625 		case FORMAT_G8R8:
626 			{
627 				unsigned short gr = *(unsigned short*)element;
628 
629 				g = (gr & 0xFF00) * (1.0f / 0xFF00);
630 				r = (gr & 0x00FF) * (1.0f / 0x00FF);
631 			}
632 			break;
633 		case FORMAT_G8R8I:
634 			{
635 				signed char* gr = (signed char*)element;
636 
637 				r = gr[0];
638 				g = gr[1];
639 			}
640 			break;
641 		case FORMAT_G8R8UI:
642 			{
643 				unsigned char* gr = (unsigned char*)element;
644 
645 				r = gr[0];
646 				g = gr[1];
647 			}
648 			break;
649 		case FORMAT_R16I:
650 			r = *((short*)element);
651 			break;
652 		case FORMAT_R16UI:
653 			r = *((unsigned short*)element);
654 			break;
655 		case FORMAT_G16R16I:
656 			{
657 				short* gr = (short*)element;
658 
659 				r = gr[0];
660 				g = gr[1];
661 			}
662 			break;
663 		case FORMAT_G16R16:
664 			{
665 				unsigned int gr = *(unsigned int*)element;
666 
667 				g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000);
668 				r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF);
669 			}
670 			break;
671 		case FORMAT_G16R16UI:
672 			{
673 				unsigned short* gr = (unsigned short*)element;
674 
675 				r = gr[0];
676 				g = gr[1];
677 			}
678 			break;
679 		case FORMAT_A2R10G10B10:
680 			{
681 				unsigned int argb = *(unsigned int*)element;
682 
683 				a = (argb & 0xC0000000) * (1.0f / 0xC0000000);
684 				r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000);
685 				g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00);
686 				b = (argb & 0x000003FF) * (1.0f / 0x000003FF);
687 			}
688 			break;
689 		case FORMAT_A2B10G10R10:
690 			{
691 				unsigned int abgr = *(unsigned int*)element;
692 
693 				a = (abgr & 0xC0000000) * (1.0f / 0xC0000000);
694 				b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000);
695 				g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00);
696 				r = (abgr & 0x000003FF) * (1.0f / 0x000003FF);
697 			}
698 			break;
699 		case FORMAT_A16B16G16R16I:
700 			{
701 				short* abgr = (short*)element;
702 
703 				r = abgr[0];
704 				g = abgr[1];
705 				b = abgr[2];
706 				a = abgr[3];
707 			}
708 			break;
709 		case FORMAT_A16B16G16R16:
710 			r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF);
711 			g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF);
712 			b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF);
713 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
714 			break;
715 		case FORMAT_A16B16G16R16UI:
716 			{
717 				unsigned short* abgr = (unsigned short*)element;
718 
719 				r = abgr[0];
720 				g = abgr[1];
721 				b = abgr[2];
722 				a = abgr[3];
723 			}
724 			break;
725 		case FORMAT_X16B16G16R16I:
726 			{
727 				short* bgr = (short*)element;
728 
729 				r = bgr[0];
730 				g = bgr[1];
731 				b = bgr[2];
732 			}
733 			break;
734 		case FORMAT_X16B16G16R16UI:
735 			{
736 				unsigned short* bgr = (unsigned short*)element;
737 
738 				r = bgr[0];
739 				g = bgr[1];
740 				b = bgr[2];
741 			}
742 			break;
743 		case FORMAT_A32B32G32R32I:
744 			{
745 				int* abgr = (int*)element;
746 
747 				r = static_cast<float>(abgr[0]);
748 				g = static_cast<float>(abgr[1]);
749 				b = static_cast<float>(abgr[2]);
750 				a = static_cast<float>(abgr[3]);
751 			}
752 			break;
753 		case FORMAT_A32B32G32R32UI:
754 			{
755 				unsigned int* abgr = (unsigned int*)element;
756 
757 				r = static_cast<float>(abgr[0]);
758 				g = static_cast<float>(abgr[1]);
759 				b = static_cast<float>(abgr[2]);
760 				a = static_cast<float>(abgr[3]);
761 			}
762 			break;
763 		case FORMAT_X32B32G32R32I:
764 			{
765 				int* bgr = (int*)element;
766 
767 				r = static_cast<float>(bgr[0]);
768 				g = static_cast<float>(bgr[1]);
769 				b = static_cast<float>(bgr[2]);
770 			}
771 			break;
772 		case FORMAT_X32B32G32R32UI:
773 			{
774 				unsigned int* bgr = (unsigned int*)element;
775 
776 				r = static_cast<float>(bgr[0]);
777 				g = static_cast<float>(bgr[1]);
778 				b = static_cast<float>(bgr[2]);
779 			}
780 			break;
781 		case FORMAT_G32R32I:
782 			{
783 				int* gr = (int*)element;
784 
785 				r = static_cast<float>(gr[0]);
786 				g = static_cast<float>(gr[1]);
787 			}
788 			break;
789 		case FORMAT_G32R32UI:
790 			{
791 				unsigned int* gr = (unsigned int*)element;
792 
793 				r = static_cast<float>(gr[0]);
794 				g = static_cast<float>(gr[1]);
795 			}
796 			break;
797 		case FORMAT_R32I:
798 			r = static_cast<float>(*((int*)element));
799 			break;
800 		case FORMAT_R32UI:
801 			r = static_cast<float>(*((unsigned int*)element));
802 			break;
803 		case FORMAT_V8U8:
804 			{
805 				unsigned short vu = *(unsigned short*)element;
806 
807 				r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000);
808 				g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000);
809 			}
810 			break;
811 		case FORMAT_L6V5U5:
812 			{
813 				unsigned short lvu = *(unsigned short*)element;
814 
815 				r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000);
816 				g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000);
817 				b = (lvu & 0xFC00) * (1.0f / 0xFC00);
818 			}
819 			break;
820 		case FORMAT_Q8W8V8U8:
821 			{
822 				unsigned int qwvu = *(unsigned int*)element;
823 
824 				r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
825 				g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
826 				b = ((int)(qwvu & 0x00FF0000) << 8)  * (1.0f / 0x7F000000);
827 				a = ((int)(qwvu & 0xFF000000) << 0)  * (1.0f / 0x7F000000);
828 			}
829 			break;
830 		case FORMAT_X8L8V8U8:
831 			{
832 				unsigned int xlvu = *(unsigned int*)element;
833 
834 				r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000);
835 				g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000);
836 				b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000);
837 			}
838 			break;
839 		case FORMAT_R8G8B8:
840 			r = ((unsigned char*)element)[2] * (1.0f / 0xFF);
841 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
842 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
843 			break;
844 		case FORMAT_B8G8R8:
845 			r = ((unsigned char*)element)[0] * (1.0f / 0xFF);
846 			g = ((unsigned char*)element)[1] * (1.0f / 0xFF);
847 			b = ((unsigned char*)element)[2] * (1.0f / 0xFF);
848 			break;
849 		case FORMAT_V16U16:
850 			{
851 				unsigned int vu = *(unsigned int*)element;
852 
853 				r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000);
854 				g = ((int)(vu & 0xFFFF0000) << 0)  * (1.0f / 0x7FFF0000);
855 			}
856 			break;
857 		case FORMAT_A2W10V10U10:
858 			{
859 				unsigned int awvu = *(unsigned int*)element;
860 
861 				r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000);
862 				g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000);
863 				b = ((int)(awvu & 0x3FF00000) << 2)  * (1.0f / 0x7FC00000);
864 				a = (awvu & 0xC0000000) * (1.0f / 0xC0000000);
865 			}
866 			break;
867 		case FORMAT_A16W16V16U16:
868 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
869 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
870 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
871 			a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF);
872 			break;
873 		case FORMAT_Q16W16V16U16:
874 			r = ((signed short*)element)[0] * (1.0f / 0x7FFF);
875 			g = ((signed short*)element)[1] * (1.0f / 0x7FFF);
876 			b = ((signed short*)element)[2] * (1.0f / 0x7FFF);
877 			a = ((signed short*)element)[3] * (1.0f / 0x7FFF);
878 			break;
879 		case FORMAT_L8:
880 			r =
881 			g =
882 			b = *(unsigned char*)element * (1.0f / 0xFF);
883 			break;
884 		case FORMAT_A4L4:
885 			{
886 				unsigned char al = *(unsigned char*)element;
887 
888 				r =
889 				g =
890 				b = (al & 0x0F) * (1.0f / 0x0F);
891 				a = (al & 0xF0) * (1.0f / 0xF0);
892 			}
893 			break;
894 		case FORMAT_L16:
895 			r =
896 			g =
897 			b = *(unsigned short*)element * (1.0f / 0xFFFF);
898 			break;
899 		case FORMAT_A8L8:
900 			r =
901 			g =
902 			b = ((unsigned char*)element)[0] * (1.0f / 0xFF);
903 			a = ((unsigned char*)element)[1] * (1.0f / 0xFF);
904 			break;
905 		case FORMAT_L16F:
906 			r =
907 			g =
908 			b = *(half*)element;
909 			break;
910 		case FORMAT_A16L16F:
911 			r =
912 			g =
913 			b = ((half*)element)[0];
914 			a = ((half*)element)[1];
915 			break;
916 		case FORMAT_L32F:
917 			r =
918 			g =
919 			b = *(float*)element;
920 			break;
921 		case FORMAT_A32L32F:
922 			r =
923 			g =
924 			b = ((float*)element)[0];
925 			a = ((float*)element)[1];
926 			break;
927 		case FORMAT_A16F:
928 			a = *(half*)element;
929 			break;
930 		case FORMAT_R16F:
931 			r = *(half*)element;
932 			break;
933 		case FORMAT_G16R16F:
934 			r = ((half*)element)[0];
935 			g = ((half*)element)[1];
936 			break;
937 		case FORMAT_B16G16R16F:
938 			r = ((half*)element)[0];
939 			g = ((half*)element)[1];
940 			b = ((half*)element)[2];
941 			break;
942 		case FORMAT_A16B16G16R16F:
943 			r = ((half*)element)[0];
944 			g = ((half*)element)[1];
945 			b = ((half*)element)[2];
946 			a = ((half*)element)[3];
947 			break;
948 		case FORMAT_A32F:
949 			a = *(float*)element;
950 			break;
951 		case FORMAT_R32F:
952 			r = *(float*)element;
953 			break;
954 		case FORMAT_G32R32F:
955 			r = ((float*)element)[0];
956 			g = ((float*)element)[1];
957 			break;
958 		case FORMAT_X32B32G32R32F:
959 		case FORMAT_B32G32R32F:
960 			r = ((float*)element)[0];
961 			g = ((float*)element)[1];
962 			b = ((float*)element)[2];
963 			break;
964 		case FORMAT_A32B32G32R32F:
965 			r = ((float*)element)[0];
966 			g = ((float*)element)[1];
967 			b = ((float*)element)[2];
968 			a = ((float*)element)[3];
969 			break;
970 		case FORMAT_D32F:
971 		case FORMAT_D32F_LOCKABLE:
972 		case FORMAT_D32FS8_TEXTURE:
973 		case FORMAT_D32FS8_SHADOW:
974 			r = *(float*)element;
975 			g = r;
976 			b = r;
977 			a = r;
978 			break;
979 		case FORMAT_D32F_COMPLEMENTARY:
980 			r = 1.0f - *(float*)element;
981 			g = r;
982 			b = r;
983 			a = r;
984 			break;
985 		case FORMAT_S8:
986 			r = *(unsigned char*)element * (1.0f / 0xFF);
987 			break;
988 		default:
989 			ASSERT(false);
990 		}
991 
992 	//	if(sRGB)
993 	//	{
994 	//		r = sRGBtoLinear(r);
995 	//		g = sRGBtoLinear(g);
996 	//		b = sRGBtoLinear(b);
997 	//	}
998 
999 		return Color<float>(r, g, b, a);
1000 	}
1001 
sample(float x,float y,float z) const1002 	Color<float> Surface::Buffer::sample(float x, float y, float z) const
1003 	{
1004 		x -= 0.5f;
1005 		y -= 0.5f;
1006 		z -= 0.5f;
1007 
1008 		int x0 = clamp((int)x, 0, width - 1);
1009 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1010 
1011 		int y0 = clamp((int)y, 0, height - 1);
1012 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1013 
1014 		int z0 = clamp((int)z, 0, depth - 1);
1015 		int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1;
1016 
1017 		Color<float> c000 = read(x0, y0, z0);
1018 		Color<float> c100 = read(x1, y0, z0);
1019 		Color<float> c010 = read(x0, y1, z0);
1020 		Color<float> c110 = read(x1, y1, z0);
1021 		Color<float> c001 = read(x0, y0, z1);
1022 		Color<float> c101 = read(x1, y0, z1);
1023 		Color<float> c011 = read(x0, y1, z1);
1024 		Color<float> c111 = read(x1, y1, z1);
1025 
1026 		float fx = x - x0;
1027 		float fy = y - y0;
1028 		float fz = z - z0;
1029 
1030 		c000 *= (1 - fx) * (1 - fy) * (1 - fz);
1031 		c100 *= fx * (1 - fy) * (1 - fz);
1032 		c010 *= (1 - fx) * fy * (1 - fz);
1033 		c110 *= fx * fy * (1 - fz);
1034 		c001 *= (1 - fx) * (1 - fy) * fz;
1035 		c101 *= fx * (1 - fy) * fz;
1036 		c011 *= (1 - fx) * fy * fz;
1037 		c111 *= fx * fy * fz;
1038 
1039 		return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111;
1040 	}
1041 
sample(float x,float y) const1042 	Color<float> Surface::Buffer::sample(float x, float y) const
1043 	{
1044 		x -= 0.5f;
1045 		y -= 0.5f;
1046 
1047 		int x0 = clamp((int)x, 0, width - 1);
1048 		int x1 = (x0 + 1 >= width) ? x0 : x0 + 1;
1049 
1050 		int y0 = clamp((int)y, 0, height - 1);
1051 		int y1 = (y0 + 1 >= height) ? y0 : y0 + 1;
1052 
1053 		Color<float> c00 = read(x0, y0);
1054 		Color<float> c10 = read(x1, y0);
1055 		Color<float> c01 = read(x0, y1);
1056 		Color<float> c11 = read(x1, y1);
1057 
1058 		float fx = x - x0;
1059 		float fy = y - y0;
1060 
1061 		c00 *= (1 - fx) * (1 - fy);
1062 		c10 *= fx * (1 - fy);
1063 		c01 *= (1 - fx) * fy;
1064 		c11 *= fx * fy;
1065 
1066 		return c00 + c10 + c01 + c11;
1067 	}
1068 
lockRect(int x,int y,int z,Lock lock)1069 	void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock)
1070 	{
1071 		this->lock = lock;
1072 
1073 		switch(lock)
1074 		{
1075 		case LOCK_UNLOCKED:
1076 		case LOCK_READONLY:
1077 			break;
1078 		case LOCK_WRITEONLY:
1079 		case LOCK_READWRITE:
1080 		case LOCK_DISCARD:
1081 			dirty = true;
1082 			break;
1083 		default:
1084 			ASSERT(false);
1085 		}
1086 
1087 		if(buffer)
1088 		{
1089 			switch(format)
1090 			{
1091 			#if S3TC_SUPPORT
1092 			case FORMAT_DXT1:
1093 			#endif
1094 			case FORMAT_ATI1:
1095 			case FORMAT_ETC1:
1096 			case FORMAT_R11_EAC:
1097 			case FORMAT_SIGNED_R11_EAC:
1098 			case FORMAT_RGB8_ETC2:
1099 			case FORMAT_SRGB8_ETC2:
1100 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1101 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1102 				return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1103 			case FORMAT_RG11_EAC:
1104 			case FORMAT_SIGNED_RG11_EAC:
1105 			case FORMAT_RGBA8_ETC2_EAC:
1106 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1107 			case FORMAT_RGBA_ASTC_4x4_KHR:
1108 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1109 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1110 			case FORMAT_RGBA_ASTC_5x4_KHR:
1111 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1112 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 4) * pitchB + z * sliceB;
1113 			case FORMAT_RGBA_ASTC_5x5_KHR:
1114 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1115 				return (unsigned char*)buffer + 16 * (x / 5) + (y / 5) * pitchB + z * sliceB;
1116 			case FORMAT_RGBA_ASTC_6x5_KHR:
1117 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1118 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 5) * pitchB + z * sliceB;
1119 			case FORMAT_RGBA_ASTC_6x6_KHR:
1120 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1121 				return (unsigned char*)buffer + 16 * (x / 6) + (y / 6) * pitchB + z * sliceB;
1122 			case FORMAT_RGBA_ASTC_8x5_KHR:
1123 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1124 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 5) * pitchB + z * sliceB;
1125 			case FORMAT_RGBA_ASTC_8x6_KHR:
1126 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1127 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 6) * pitchB + z * sliceB;
1128 			case FORMAT_RGBA_ASTC_8x8_KHR:
1129 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1130 				return (unsigned char*)buffer + 16 * (x / 8) + (y / 8) * pitchB + z * sliceB;
1131 			case FORMAT_RGBA_ASTC_10x5_KHR:
1132 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1133 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 5) * pitchB + z * sliceB;
1134 			case FORMAT_RGBA_ASTC_10x6_KHR:
1135 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1136 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 6) * pitchB + z * sliceB;
1137 			case FORMAT_RGBA_ASTC_10x8_KHR:
1138 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1139 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 8) * pitchB + z * sliceB;
1140 			case FORMAT_RGBA_ASTC_10x10_KHR:
1141 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1142 				return (unsigned char*)buffer + 16 * (x / 10) + (y / 10) * pitchB + z * sliceB;
1143 			case FORMAT_RGBA_ASTC_12x10_KHR:
1144 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1145 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 10) * pitchB + z * sliceB;
1146 			case FORMAT_RGBA_ASTC_12x12_KHR:
1147 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1148 				return (unsigned char*)buffer + 16 * (x / 12) + (y / 12) * pitchB + z * sliceB;
1149 			#if S3TC_SUPPORT
1150 			case FORMAT_DXT3:
1151 			case FORMAT_DXT5:
1152 			#endif
1153 			case FORMAT_ATI2:
1154 				return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB;
1155 			default:
1156 				return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB;
1157 			}
1158 		}
1159 
1160 		return 0;
1161 	}
1162 
unlockRect()1163 	void Surface::Buffer::unlockRect()
1164 	{
1165 		lock = LOCK_UNLOCKED;
1166 	}
1167 
Surface(int width,int height,int depth,Format format,void * pixels,int pitch,int slice)1168 	Surface::Surface(int width, int height, int depth, Format format, void *pixels, int pitch, int slice) : lockable(true), renderTarget(false)
1169 	{
1170 		resource = new Resource(0);
1171 		hasParent = false;
1172 		ownExternal = false;
1173 		depth = max(1, depth);
1174 
1175 		external.buffer = pixels;
1176 		external.width = width;
1177 		external.height = height;
1178 		external.depth = depth;
1179 		external.format = format;
1180 		external.bytes = bytes(external.format);
1181 		external.pitchB = pitch;
1182 		external.pitchP = external.bytes ? pitch / external.bytes : 0;
1183 		external.sliceB = slice;
1184 		external.sliceP = external.bytes ? slice / external.bytes : 0;
1185 		external.lock = LOCK_UNLOCKED;
1186 		external.dirty = true;
1187 
1188 		internal.buffer = 0;
1189 		internal.width = width;
1190 		internal.height = height;
1191 		internal.depth = depth;
1192 		internal.format = selectInternalFormat(format);
1193 		internal.bytes = bytes(internal.format);
1194 		internal.pitchB = pitchB(internal.width, internal.format, false);
1195 		internal.pitchP = pitchP(internal.width, internal.format, false);
1196 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, false);
1197 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, false);
1198 		internal.lock = LOCK_UNLOCKED;
1199 		internal.dirty = false;
1200 
1201 		stencil.buffer = 0;
1202 		stencil.width = width;
1203 		stencil.height = height;
1204 		stencil.depth = depth;
1205 		stencil.format = FORMAT_S8;
1206 		stencil.bytes = bytes(stencil.format);
1207 		stencil.pitchB = pitchB(stencil.width, stencil.format, false);
1208 		stencil.pitchP = pitchP(stencil.width, stencil.format, false);
1209 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, false);
1210 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, false);
1211 		stencil.lock = LOCK_UNLOCKED;
1212 		stencil.dirty = false;
1213 
1214 		dirtyMipmaps = true;
1215 		paletteUsed = 0;
1216 	}
1217 
Surface(Resource * texture,int width,int height,int depth,Format format,bool lockable,bool renderTarget,int pitchPprovided)1218 	Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget, int pitchPprovided) : lockable(lockable), renderTarget(renderTarget)
1219 	{
1220 		resource = texture ? texture : new Resource(0);
1221 		hasParent = texture != 0;
1222 		ownExternal = true;
1223 		depth = max(1, depth);
1224 
1225 		external.buffer = 0;
1226 		external.width = width;
1227 		external.height = height;
1228 		external.depth = depth;
1229 		external.format = format;
1230 		external.bytes = bytes(external.format);
1231 		external.pitchB = pitchB(external.width, external.format, renderTarget && !texture);
1232 		external.pitchP = pitchP(external.width, external.format, renderTarget && !texture);
1233 		external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture);
1234 		external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture);
1235 		external.lock = LOCK_UNLOCKED;
1236 		external.dirty = false;
1237 
1238 		internal.buffer = 0;
1239 		internal.width = width;
1240 		internal.height = height;
1241 		internal.depth = depth;
1242 		internal.format = selectInternalFormat(format);
1243 		internal.bytes = bytes(internal.format);
1244 		internal.pitchB = !pitchPprovided ? pitchB(internal.width, internal.format, renderTarget) : pitchPprovided * internal.bytes;
1245 		internal.pitchP = !pitchPprovided ? pitchP(internal.width, internal.format, renderTarget) : pitchPprovided;
1246 		internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget);
1247 		internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget);
1248 		internal.lock = LOCK_UNLOCKED;
1249 		internal.dirty = false;
1250 
1251 		stencil.buffer = 0;
1252 		stencil.width = width;
1253 		stencil.height = height;
1254 		stencil.depth = depth;
1255 		stencil.format = FORMAT_S8;
1256 		stencil.bytes = bytes(stencil.format);
1257 		stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget);
1258 		stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget);
1259 		stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget);
1260 		stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget);
1261 		stencil.lock = LOCK_UNLOCKED;
1262 		stencil.dirty = false;
1263 
1264 		dirtyMipmaps = true;
1265 		paletteUsed = 0;
1266 	}
1267 
~Surface()1268 	Surface::~Surface()
1269 	{
1270 		// Synchronize so we can deallocate the buffers below
1271 		resource->lock(DESTRUCT);
1272 		resource->unlock();
1273 
1274 		if(!hasParent)
1275 		{
1276 			resource->destruct();
1277 		}
1278 
1279 		if(ownExternal)
1280 		{
1281 			deallocate(external.buffer);
1282 		}
1283 
1284 		if(internal.buffer != external.buffer)
1285 		{
1286 			deallocate(internal.buffer);
1287 		}
1288 
1289 		deallocate(stencil.buffer);
1290 
1291 		external.buffer = 0;
1292 		internal.buffer = 0;
1293 		stencil.buffer = 0;
1294 	}
1295 
lockExternal(int x,int y,int z,Lock lock,Accessor client)1296 	void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client)
1297 	{
1298 		resource->lock(client);
1299 
1300 		if(!external.buffer)
1301 		{
1302 			if(internal.buffer && identicalFormats())
1303 			{
1304 				external.buffer = internal.buffer;
1305 			}
1306 			else
1307 			{
1308 				external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format);
1309 			}
1310 		}
1311 
1312 		if(internal.dirty)
1313 		{
1314 			if(lock != LOCK_DISCARD)
1315 			{
1316 				update(external, internal);
1317 			}
1318 
1319 			internal.dirty = false;
1320 		}
1321 
1322 		switch(lock)
1323 		{
1324 		case LOCK_READONLY:
1325 			break;
1326 		case LOCK_WRITEONLY:
1327 		case LOCK_READWRITE:
1328 		case LOCK_DISCARD:
1329 			dirtyMipmaps = true;
1330 			break;
1331 		default:
1332 			ASSERT(false);
1333 		}
1334 
1335 		return external.lockRect(x, y, z, lock);
1336 	}
1337 
unlockExternal()1338 	void Surface::unlockExternal()
1339 	{
1340 		resource->unlock();
1341 
1342 		external.unlockRect();
1343 	}
1344 
lockInternal(int x,int y,int z,Lock lock,Accessor client)1345 	void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client)
1346 	{
1347 		if(lock != LOCK_UNLOCKED)
1348 		{
1349 			resource->lock(client);
1350 		}
1351 
1352 		if(!internal.buffer)
1353 		{
1354 			if(external.buffer && identicalFormats())
1355 			{
1356 				internal.buffer = external.buffer;
1357 			}
1358 			else
1359 			{
1360 				internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format);
1361 			}
1362 		}
1363 
1364 		// FIXME: WHQL requires conversion to lower external precision and back
1365 		if(logPrecision >= WHQL)
1366 		{
1367 			if(internal.dirty && renderTarget && internal.format != external.format)
1368 			{
1369 				if(lock != LOCK_DISCARD)
1370 				{
1371 					switch(external.format)
1372 					{
1373 					case FORMAT_R3G3B2:
1374 					case FORMAT_A8R3G3B2:
1375 					case FORMAT_A1R5G5B5:
1376 					case FORMAT_A2R10G10B10:
1377 					case FORMAT_A2B10G10R10:
1378 						lockExternal(0, 0, 0, LOCK_READWRITE, client);
1379 						unlockExternal();
1380 						break;
1381 					default:
1382 						// Difference passes WHQL
1383 						break;
1384 					}
1385 				}
1386 			}
1387 		}
1388 
1389 		if(external.dirty || (isPalette(external.format) && paletteUsed != Surface::paletteID))
1390 		{
1391 			if(lock != LOCK_DISCARD)
1392 			{
1393 				update(internal, external);
1394 			}
1395 
1396 			external.dirty = false;
1397 			paletteUsed = Surface::paletteID;
1398 		}
1399 
1400 		switch(lock)
1401 		{
1402 		case LOCK_UNLOCKED:
1403 		case LOCK_READONLY:
1404 			break;
1405 		case LOCK_WRITEONLY:
1406 		case LOCK_READWRITE:
1407 		case LOCK_DISCARD:
1408 			dirtyMipmaps = true;
1409 			break;
1410 		default:
1411 			ASSERT(false);
1412 		}
1413 
1414 		if(lock == LOCK_READONLY && client == PUBLIC)
1415 		{
1416 			resolve();
1417 		}
1418 
1419 		return internal.lockRect(x, y, z, lock);
1420 	}
1421 
unlockInternal()1422 	void Surface::unlockInternal()
1423 	{
1424 		resource->unlock();
1425 
1426 		internal.unlockRect();
1427 	}
1428 
lockStencil(int front,Accessor client)1429 	void *Surface::lockStencil(int front, Accessor client)
1430 	{
1431 		resource->lock(client);
1432 
1433 		if(!stencil.buffer)
1434 		{
1435 			stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format);
1436 		}
1437 
1438 		return stencil.lockRect(0, 0, front, LOCK_READWRITE);   // FIXME
1439 	}
1440 
unlockStencil()1441 	void Surface::unlockStencil()
1442 	{
1443 		resource->unlock();
1444 
1445 		stencil.unlockRect();
1446 	}
1447 
bytes(Format format)1448 	int Surface::bytes(Format format)
1449 	{
1450 		switch(format)
1451 		{
1452 		case FORMAT_NULL:				return 0;
1453 		case FORMAT_P8:					return 1;
1454 		case FORMAT_A8P8:				return 2;
1455 		case FORMAT_A8:					return 1;
1456 		case FORMAT_R8I:				return 1;
1457 		case FORMAT_R8:					return 1;
1458 		case FORMAT_R3G3B2:				return 1;
1459 		case FORMAT_R16I:				return 2;
1460 		case FORMAT_R16UI:				return 2;
1461 		case FORMAT_A8R3G3B2:			return 2;
1462 		case FORMAT_R5G6B5:				return 2;
1463 		case FORMAT_A1R5G5B5:			return 2;
1464 		case FORMAT_X1R5G5B5:			return 2;
1465 		case FORMAT_R5G5B5A1:           return 2;
1466 		case FORMAT_X4R4G4B4:			return 2;
1467 		case FORMAT_A4R4G4B4:			return 2;
1468 		case FORMAT_R4G4B4A4:           return 2;
1469 		case FORMAT_R8G8B8:				return 3;
1470 		case FORMAT_B8G8R8:             return 3;
1471 		case FORMAT_R32I:				return 4;
1472 		case FORMAT_R32UI:				return 4;
1473 		case FORMAT_X8R8G8B8:			return 4;
1474 	//	case FORMAT_X8G8R8B8Q:			return 4;
1475 		case FORMAT_A8R8G8B8:			return 4;
1476 	//	case FORMAT_A8G8R8B8Q:			return 4;
1477 		case FORMAT_X8B8G8R8I:			return 4;
1478 		case FORMAT_X8B8G8R8:			return 4;
1479 		case FORMAT_SRGB8_X8:			return 4;
1480 		case FORMAT_SRGB8_A8:			return 4;
1481 		case FORMAT_A8B8G8R8I:			return 4;
1482 		case FORMAT_R8UI:				return 1;
1483 		case FORMAT_G8R8UI:				return 2;
1484 		case FORMAT_X8B8G8R8UI:			return 4;
1485 		case FORMAT_A8B8G8R8UI:			return 4;
1486 		case FORMAT_A8B8G8R8:			return 4;
1487 		case FORMAT_R8I_SNORM:			return 1;
1488 		case FORMAT_G8R8I_SNORM:		return 2;
1489 		case FORMAT_X8B8G8R8I_SNORM:	return 4;
1490 		case FORMAT_A8B8G8R8I_SNORM:	return 4;
1491 		case FORMAT_A2R10G10B10:		return 4;
1492 		case FORMAT_A2B10G10R10:		return 4;
1493 		case FORMAT_G8R8I:				return 2;
1494 		case FORMAT_G8R8:				return 2;
1495 		case FORMAT_G16R16I:			return 4;
1496 		case FORMAT_G16R16UI:			return 4;
1497 		case FORMAT_G16R16:				return 4;
1498 		case FORMAT_G32R32I:			return 8;
1499 		case FORMAT_G32R32UI:			return 8;
1500 		case FORMAT_X16B16G16R16I:		return 8;
1501 		case FORMAT_X16B16G16R16UI:		return 8;
1502 		case FORMAT_A16B16G16R16I:		return 8;
1503 		case FORMAT_A16B16G16R16UI:		return 8;
1504 		case FORMAT_A16B16G16R16:		return 8;
1505 		case FORMAT_X32B32G32R32I:		return 16;
1506 		case FORMAT_X32B32G32R32UI:		return 16;
1507 		case FORMAT_A32B32G32R32I:		return 16;
1508 		case FORMAT_A32B32G32R32UI:		return 16;
1509 		// Compressed formats
1510 		#if S3TC_SUPPORT
1511 		case FORMAT_DXT1:				return 2;   // Column of four pixels
1512 		case FORMAT_DXT3:				return 4;   // Column of four pixels
1513 		case FORMAT_DXT5:				return 4;   // Column of four pixels
1514 		#endif
1515 		case FORMAT_ATI1:				return 2;   // Column of four pixels
1516 		case FORMAT_ATI2:				return 4;   // Column of four pixels
1517 		case FORMAT_ETC1:				return 2;   // Column of four pixels
1518 		case FORMAT_R11_EAC:			return 2;
1519 		case FORMAT_SIGNED_R11_EAC:		return 2;
1520 		case FORMAT_RG11_EAC:			return 4;
1521 		case FORMAT_SIGNED_RG11_EAC:	return 4;
1522 		case FORMAT_RGB8_ETC2:			return 2;
1523 		case FORMAT_SRGB8_ETC2:			return 2;
1524 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1525 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:	return 2;
1526 		case FORMAT_RGBA8_ETC2_EAC:			return 4;
1527 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:	return 4;
1528 		case FORMAT_RGBA_ASTC_4x4_KHR:
1529 		case FORMAT_RGBA_ASTC_5x4_KHR:
1530 		case FORMAT_RGBA_ASTC_5x5_KHR:
1531 		case FORMAT_RGBA_ASTC_6x5_KHR:
1532 		case FORMAT_RGBA_ASTC_6x6_KHR:
1533 		case FORMAT_RGBA_ASTC_8x5_KHR:
1534 		case FORMAT_RGBA_ASTC_8x6_KHR:
1535 		case FORMAT_RGBA_ASTC_8x8_KHR:
1536 		case FORMAT_RGBA_ASTC_10x5_KHR:
1537 		case FORMAT_RGBA_ASTC_10x6_KHR:
1538 		case FORMAT_RGBA_ASTC_10x8_KHR:
1539 		case FORMAT_RGBA_ASTC_10x10_KHR:
1540 		case FORMAT_RGBA_ASTC_12x10_KHR:
1541 		case FORMAT_RGBA_ASTC_12x12_KHR:
1542 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1543 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1544 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1545 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1546 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1547 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1548 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1549 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1550 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1551 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1552 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1553 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1554 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1555 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: return 0; // FIXME
1556 		// Bumpmap formats
1557 		case FORMAT_V8U8:				return 2;
1558 		case FORMAT_L6V5U5:				return 2;
1559 		case FORMAT_Q8W8V8U8:			return 4;
1560 		case FORMAT_X8L8V8U8:			return 4;
1561 		case FORMAT_A2W10V10U10:		return 4;
1562 		case FORMAT_V16U16:				return 4;
1563 		case FORMAT_A16W16V16U16:		return 8;
1564 		case FORMAT_Q16W16V16U16:		return 8;
1565 		// Luminance formats
1566 		case FORMAT_L8:					return 1;
1567 		case FORMAT_A4L4:				return 1;
1568 		case FORMAT_L16:				return 2;
1569 		case FORMAT_A8L8:				return 2;
1570 		case FORMAT_L16F:               return 2;
1571 		case FORMAT_A16L16F:            return 4;
1572 		case FORMAT_L32F:               return 4;
1573 		case FORMAT_A32L32F:            return 8;
1574 		// Floating-point formats
1575 		case FORMAT_A16F:				return 2;
1576 		case FORMAT_R16F:				return 2;
1577 		case FORMAT_G16R16F:			return 4;
1578 		case FORMAT_B16G16R16F:			return 6;
1579 		case FORMAT_A16B16G16R16F:		return 8;
1580 		case FORMAT_A32F:				return 4;
1581 		case FORMAT_R32F:				return 4;
1582 		case FORMAT_G32R32F:			return 8;
1583 		case FORMAT_B32G32R32F:			return 12;
1584 		case FORMAT_X32B32G32R32F:		return 16;
1585 		case FORMAT_A32B32G32R32F:		return 16;
1586 		// Depth/stencil formats
1587 		case FORMAT_D16:				return 2;
1588 		case FORMAT_D32:				return 4;
1589 		case FORMAT_D24X8:				return 4;
1590 		case FORMAT_D24S8:				return 4;
1591 		case FORMAT_D24FS8:				return 4;
1592 		case FORMAT_D32F:				return 4;
1593 		case FORMAT_D32F_COMPLEMENTARY:	return 4;
1594 		case FORMAT_D32F_LOCKABLE:		return 4;
1595 		case FORMAT_D32FS8_TEXTURE:		return 4;
1596 		case FORMAT_D32FS8_SHADOW:		return 4;
1597 		case FORMAT_DF24S8:				return 4;
1598 		case FORMAT_DF16S8:				return 2;
1599 		case FORMAT_INTZ:				return 4;
1600 		case FORMAT_S8:					return 1;
1601 		case FORMAT_YV12_BT601:         return 1;   // Y plane only
1602 		case FORMAT_YV12_BT709:         return 1;   // Y plane only
1603 		case FORMAT_YV12_JFIF:          return 1;   // Y plane only
1604 		default:
1605 			ASSERT(false);
1606 		}
1607 
1608 		return 0;
1609 	}
1610 
pitchB(int width,Format format,bool target)1611 	int Surface::pitchB(int width, Format format, bool target)
1612 	{
1613 		if(target || isDepth(format) || isStencil(format))
1614 		{
1615 			width = align(width, 2);
1616 		}
1617 
1618 		switch(format)
1619 		{
1620 		#if S3TC_SUPPORT
1621 		case FORMAT_DXT1:
1622 		#endif
1623 		case FORMAT_ETC1:
1624 		case FORMAT_R11_EAC:
1625 		case FORMAT_SIGNED_R11_EAC:
1626 		case FORMAT_RGB8_ETC2:
1627 		case FORMAT_SRGB8_ETC2:
1628 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1629 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1630 			return 8 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per 4 rows
1631 		case FORMAT_RG11_EAC:
1632 		case FORMAT_SIGNED_RG11_EAC:
1633 		case FORMAT_RGBA8_ETC2_EAC:
1634 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1635 		case FORMAT_RGBA_ASTC_4x4_KHR:
1636 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1637 			return 16 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per 4 rows
1638 		case FORMAT_RGBA_ASTC_5x4_KHR:
1639 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1640 		case FORMAT_RGBA_ASTC_5x5_KHR:
1641 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1642 			return 16 * ((width + 4) / 5);
1643 		case FORMAT_RGBA_ASTC_6x5_KHR:
1644 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1645 		case FORMAT_RGBA_ASTC_6x6_KHR:
1646 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1647 			return 16 * ((width + 5) / 6);
1648 		case FORMAT_RGBA_ASTC_8x5_KHR:
1649 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1650 		case FORMAT_RGBA_ASTC_8x6_KHR:
1651 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1652 		case FORMAT_RGBA_ASTC_8x8_KHR:
1653 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1654 			return 16 * ((width + 7) / 8);
1655 		case FORMAT_RGBA_ASTC_10x5_KHR:
1656 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1657 		case FORMAT_RGBA_ASTC_10x6_KHR:
1658 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1659 		case FORMAT_RGBA_ASTC_10x8_KHR:
1660 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1661 		case FORMAT_RGBA_ASTC_10x10_KHR:
1662 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1663 			return 16 * ((width + 9) / 10);
1664 		case FORMAT_RGBA_ASTC_12x10_KHR:
1665 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1666 		case FORMAT_RGBA_ASTC_12x12_KHR:
1667 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1668 			return 16 * ((width + 11) / 12);
1669 		#if S3TC_SUPPORT
1670 		case FORMAT_DXT3:
1671 		case FORMAT_DXT5:
1672 			return 16 * ((width + 3) / 4);   // 128 bit per 4x4 block, computed per 4 rows
1673 		#endif
1674 		case FORMAT_ATI1:
1675 			return 2 * ((width + 3) / 4);    // 64 bit per 4x4 block, computed per row
1676 		case FORMAT_ATI2:
1677 			return 4 * ((width + 3) / 4);    // 128 bit per 4x4 block, computed per row
1678 		case FORMAT_YV12_BT601:
1679 		case FORMAT_YV12_BT709:
1680 		case FORMAT_YV12_JFIF:
1681 			return align(width, 16);
1682 		default:
1683 			return bytes(format) * width;
1684 		}
1685 	}
1686 
pitchP(int width,Format format,bool target)1687 	int Surface::pitchP(int width, Format format, bool target)
1688 	{
1689 		int B = bytes(format);
1690 
1691 		return B > 0 ? pitchB(width, format, target) / B : 0;
1692 	}
1693 
sliceB(int width,int height,Format format,bool target)1694 	int Surface::sliceB(int width, int height, Format format, bool target)
1695 	{
1696 		if(target || isDepth(format) || isStencil(format))
1697 		{
1698 			height = ((height + 1) & ~1);
1699 		}
1700 
1701 		switch(format)
1702 		{
1703 		#if S3TC_SUPPORT
1704 		case FORMAT_DXT1:
1705 		case FORMAT_DXT3:
1706 		case FORMAT_DXT5:
1707 		#endif
1708 		case FORMAT_ETC1:
1709 		case FORMAT_R11_EAC:
1710 		case FORMAT_SIGNED_R11_EAC:
1711 		case FORMAT_RG11_EAC:
1712 		case FORMAT_SIGNED_RG11_EAC:
1713 		case FORMAT_RGB8_ETC2:
1714 		case FORMAT_SRGB8_ETC2:
1715 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1716 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
1717 		case FORMAT_RGBA8_ETC2_EAC:
1718 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
1719 		case FORMAT_RGBA_ASTC_4x4_KHR:
1720 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
1721 		case FORMAT_RGBA_ASTC_5x4_KHR:
1722 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
1723 			return pitchB(width, format, target) * ((height + 3) / 4);   // Pitch computed per 4 rows
1724 		case FORMAT_RGBA_ASTC_5x5_KHR:
1725 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
1726 		case FORMAT_RGBA_ASTC_6x5_KHR:
1727 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
1728 		case FORMAT_RGBA_ASTC_8x5_KHR:
1729 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
1730 		case FORMAT_RGBA_ASTC_10x5_KHR:
1731 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
1732 			return pitchB(width, format, target) * ((height + 4) / 5);   // Pitch computed per 5 rows
1733 		case FORMAT_RGBA_ASTC_6x6_KHR:
1734 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
1735 		case FORMAT_RGBA_ASTC_8x6_KHR:
1736 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
1737 		case FORMAT_RGBA_ASTC_10x6_KHR:
1738 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
1739 			return pitchB(width, format, target) * ((height + 5) / 6);   // Pitch computed per 6 rows
1740 		case FORMAT_RGBA_ASTC_8x8_KHR:
1741 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
1742 		case FORMAT_RGBA_ASTC_10x8_KHR:
1743 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
1744 			return pitchB(width, format, target) * ((height + 7) / 8);   // Pitch computed per 8 rows
1745 		case FORMAT_RGBA_ASTC_10x10_KHR:
1746 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
1747 		case FORMAT_RGBA_ASTC_12x10_KHR:
1748 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
1749 			return pitchB(width, format, target) * ((height + 9) / 10);   // Pitch computed per 10 rows
1750 		case FORMAT_RGBA_ASTC_12x12_KHR:
1751 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
1752 			return pitchB(width, format, target) * ((height + 11) / 12);   // Pitch computed per 12 rows
1753 		case FORMAT_ATI1:
1754 		case FORMAT_ATI2:
1755 		default:
1756 			return pitchB(width, format, target) * height;   // Pitch computed per row
1757 		}
1758 	}
1759 
sliceP(int width,int height,Format format,bool target)1760 	int Surface::sliceP(int width, int height, Format format, bool target)
1761 	{
1762 		int B = bytes(format);
1763 
1764 		return B > 0 ? sliceB(width, height, format, target) / B : 0;
1765 	}
1766 
update(Buffer & destination,Buffer & source)1767 	void Surface::update(Buffer &destination, Buffer &source)
1768 	{
1769 	//	ASSERT(source.lock != LOCK_UNLOCKED);
1770 	//	ASSERT(destination.lock != LOCK_UNLOCKED);
1771 
1772 		if(destination.buffer != source.buffer)
1773 		{
1774 			ASSERT(source.dirty && !destination.dirty);
1775 
1776 			switch(source.format)
1777 			{
1778 			case FORMAT_R8G8B8:		decodeR8G8B8(destination, source);		break;   // FIXME: Check destination format
1779 			case FORMAT_X1R5G5B5:	decodeX1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1780 			case FORMAT_A1R5G5B5:	decodeA1R5G5B5(destination, source);	break;   // FIXME: Check destination format
1781 			case FORMAT_X4R4G4B4:	decodeX4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1782 			case FORMAT_A4R4G4B4:	decodeA4R4G4B4(destination, source);	break;   // FIXME: Check destination format
1783 			case FORMAT_P8:			decodeP8(destination, source);			break;   // FIXME: Check destination format
1784 			#if S3TC_SUPPORT
1785 			case FORMAT_DXT1:		decodeDXT1(destination, source);		break;   // FIXME: Check destination format
1786 			case FORMAT_DXT3:		decodeDXT3(destination, source);		break;   // FIXME: Check destination format
1787 			case FORMAT_DXT5:		decodeDXT5(destination, source);		break;   // FIXME: Check destination format
1788 			#endif
1789 			case FORMAT_ATI1:		decodeATI1(destination, source);		break;   // FIXME: Check destination format
1790 			case FORMAT_ATI2:		decodeATI2(destination, source);		break;   // FIXME: Check destination format
1791 			case FORMAT_R11_EAC:         decodeEAC(destination, source, 1, false); break; // FIXME: Check destination format
1792 			case FORMAT_SIGNED_R11_EAC:  decodeEAC(destination, source, 1, true);  break; // FIXME: Check destination format
1793 			case FORMAT_RG11_EAC:        decodeEAC(destination, source, 2, false); break; // FIXME: Check destination format
1794 			case FORMAT_SIGNED_RG11_EAC: decodeEAC(destination, source, 2, true);  break; // FIXME: Check destination format
1795 			case FORMAT_ETC1:
1796 			case FORMAT_RGB8_ETC2:                      decodeETC2(destination, source, 0, false); break; // FIXME: Check destination format
1797 			case FORMAT_SRGB8_ETC2:                     decodeETC2(destination, source, 0, true);  break; // FIXME: Check destination format
1798 			case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:  decodeETC2(destination, source, 1, false); break; // FIXME: Check destination format
1799 			case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: decodeETC2(destination, source, 1, true);  break; // FIXME: Check destination format
1800 			case FORMAT_RGBA8_ETC2_EAC:                 decodeETC2(destination, source, 8, false); break; // FIXME: Check destination format
1801 			case FORMAT_SRGB8_ALPHA8_ETC2_EAC:          decodeETC2(destination, source, 8, true);  break; // FIXME: Check destination format
1802 			case FORMAT_RGBA_ASTC_4x4_KHR:           decodeASTC(destination, source, 4,  4,  1, false); break; // FIXME: Check destination format
1803 			case FORMAT_RGBA_ASTC_5x4_KHR:           decodeASTC(destination, source, 5,  4,  1, false); break; // FIXME: Check destination format
1804 			case FORMAT_RGBA_ASTC_5x5_KHR:           decodeASTC(destination, source, 5,  5,  1, false); break; // FIXME: Check destination format
1805 			case FORMAT_RGBA_ASTC_6x5_KHR:           decodeASTC(destination, source, 6,  5,  1, false); break; // FIXME: Check destination format
1806 			case FORMAT_RGBA_ASTC_6x6_KHR:           decodeASTC(destination, source, 6,  6,  1, false); break; // FIXME: Check destination format
1807 			case FORMAT_RGBA_ASTC_8x5_KHR:           decodeASTC(destination, source, 8,  5,  1, false); break; // FIXME: Check destination format
1808 			case FORMAT_RGBA_ASTC_8x6_KHR:           decodeASTC(destination, source, 8,  6,  1, false); break; // FIXME: Check destination format
1809 			case FORMAT_RGBA_ASTC_8x8_KHR:           decodeASTC(destination, source, 8,  8,  1, false); break; // FIXME: Check destination format
1810 			case FORMAT_RGBA_ASTC_10x5_KHR:          decodeASTC(destination, source, 10, 5,  1, false); break; // FIXME: Check destination format
1811 			case FORMAT_RGBA_ASTC_10x6_KHR:          decodeASTC(destination, source, 10, 6,  1, false); break; // FIXME: Check destination format
1812 			case FORMAT_RGBA_ASTC_10x8_KHR:          decodeASTC(destination, source, 10, 8,  1, false); break; // FIXME: Check destination format
1813 			case FORMAT_RGBA_ASTC_10x10_KHR:         decodeASTC(destination, source, 10, 10, 1, false); break; // FIXME: Check destination format
1814 			case FORMAT_RGBA_ASTC_12x10_KHR:         decodeASTC(destination, source, 12, 10, 1, false); break; // FIXME: Check destination format
1815 			case FORMAT_RGBA_ASTC_12x12_KHR:         decodeASTC(destination, source, 12, 12, 1, false); break; // FIXME: Check destination format
1816 			case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:   decodeASTC(destination, source, 4,  4,  1, true);  break; // FIXME: Check destination format
1817 			case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:   decodeASTC(destination, source, 5,  4,  1, true);  break; // FIXME: Check destination format
1818 			case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:   decodeASTC(destination, source, 5,  5,  1, true);  break; // FIXME: Check destination format
1819 			case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:   decodeASTC(destination, source, 6,  5,  1, true);  break; // FIXME: Check destination format
1820 			case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:   decodeASTC(destination, source, 6,  6,  1, true);  break; // FIXME: Check destination format
1821 			case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:   decodeASTC(destination, source, 8,  5,  1, true);  break; // FIXME: Check destination format
1822 			case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:   decodeASTC(destination, source, 8,  6,  1, true);  break; // FIXME: Check destination format
1823 			case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:   decodeASTC(destination, source, 8,  8,  1, true);  break; // FIXME: Check destination format
1824 			case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:  decodeASTC(destination, source, 10, 5,  1, true);  break; // FIXME: Check destination format
1825 			case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:  decodeASTC(destination, source, 10, 6,  1, true);  break; // FIXME: Check destination format
1826 			case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:  decodeASTC(destination, source, 10, 8,  1, true);  break; // FIXME: Check destination format
1827 			case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR: decodeASTC(destination, source, 10, 10, 1, true);  break; // FIXME: Check destination format
1828 			case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR: decodeASTC(destination, source, 12, 10, 1, true);  break; // FIXME: Check destination format
1829 			case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR: decodeASTC(destination, source, 12, 12, 1, true);  break; // FIXME: Check destination format
1830 			default:				genericUpdate(destination, source);		break;
1831 			}
1832 		}
1833 	}
1834 
genericUpdate(Buffer & destination,Buffer & source)1835 	void Surface::genericUpdate(Buffer &destination, Buffer &source)
1836 	{
1837 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1838 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1839 
1840 		int depth = min(destination.depth, source.depth);
1841 		int height = min(destination.height, source.height);
1842 		int width = min(destination.width, source.width);
1843 		int rowBytes = width * source.bytes;
1844 
1845 		for(int z = 0; z < depth; z++)
1846 		{
1847 			unsigned char *sourceRow = sourceSlice;
1848 			unsigned char *destinationRow = destinationSlice;
1849 
1850 			for(int y = 0; y < height; y++)
1851 			{
1852 				if(source.format == destination.format)
1853 				{
1854 					memcpy(destinationRow, sourceRow, rowBytes);
1855 				}
1856 				else
1857 				{
1858 					unsigned char *sourceElement = sourceRow;
1859 					unsigned char *destinationElement = destinationRow;
1860 
1861 					for(int x = 0; x < width; x++)
1862 					{
1863 						Color<float> color = source.read(sourceElement);
1864 						destination.write(destinationElement, color);
1865 
1866 						sourceElement += source.bytes;
1867 						destinationElement += destination.bytes;
1868 					}
1869 				}
1870 
1871 				sourceRow += source.pitchB;
1872 				destinationRow += destination.pitchB;
1873 			}
1874 
1875 			sourceSlice += source.sliceB;
1876 			destinationSlice += destination.sliceB;
1877 		}
1878 	}
1879 
decodeR8G8B8(Buffer & destination,const Buffer & source)1880 	void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source)
1881 	{
1882 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1883 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1884 
1885 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1886 		{
1887 			unsigned char *sourceRow = sourceSlice;
1888 			unsigned char *destinationRow = destinationSlice;
1889 
1890 			for(int y = 0; y < destination.height && y < source.height; y++)
1891 			{
1892 				unsigned char *sourceElement = sourceRow;
1893 				unsigned char *destinationElement = destinationRow;
1894 
1895 				for(int x = 0; x < destination.width && x < source.width; x++)
1896 				{
1897 					unsigned int b = sourceElement[0];
1898 					unsigned int g = sourceElement[1];
1899 					unsigned int r = sourceElement[2];
1900 
1901 					*(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0);
1902 
1903 					sourceElement += source.bytes;
1904 					destinationElement += destination.bytes;
1905 				}
1906 
1907 				sourceRow += source.pitchB;
1908 				destinationRow += destination.pitchB;
1909 			}
1910 
1911 			sourceSlice += source.sliceB;
1912 			destinationSlice += destination.sliceB;
1913 		}
1914 	}
1915 
decodeX1R5G5B5(Buffer & destination,const Buffer & source)1916 	void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source)
1917 	{
1918 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1919 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1920 
1921 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1922 		{
1923 			unsigned char *sourceRow = sourceSlice;
1924 			unsigned char *destinationRow = destinationSlice;
1925 
1926 			for(int y = 0; y < destination.height && y < source.height; y++)
1927 			{
1928 				unsigned char *sourceElement = sourceRow;
1929 				unsigned char *destinationElement = destinationRow;
1930 
1931 				for(int x = 0; x < destination.width && x < source.width; x++)
1932 				{
1933 					unsigned int xrgb = *(unsigned short*)sourceElement;
1934 
1935 					unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1936 					unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00;
1937 					unsigned int b = (((xrgb & 0x001F) * 2106  + 0x80) >> 8);
1938 
1939 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
1940 
1941 					sourceElement += source.bytes;
1942 					destinationElement += destination.bytes;
1943 				}
1944 
1945 				sourceRow += source.pitchB;
1946 				destinationRow += destination.pitchB;
1947 			}
1948 
1949 			sourceSlice += source.sliceB;
1950 			destinationSlice += destination.sliceB;
1951 		}
1952 	}
1953 
decodeA1R5G5B5(Buffer & destination,const Buffer & source)1954 	void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source)
1955 	{
1956 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1957 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1958 
1959 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1960 		{
1961 			unsigned char *sourceRow = sourceSlice;
1962 			unsigned char *destinationRow = destinationSlice;
1963 
1964 			for(int y = 0; y < destination.height && y < source.height; y++)
1965 			{
1966 				unsigned char *sourceElement = sourceRow;
1967 				unsigned char *destinationElement = destinationRow;
1968 
1969 				for(int x = 0; x < destination.width && x < source.width; x++)
1970 				{
1971 					unsigned int argb = *(unsigned short*)sourceElement;
1972 
1973 					unsigned int a =   (argb & 0x8000) * 130560;
1974 					unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000;
1975 					unsigned int g = (((argb & 0x03E0) * 16846  + 0x8000) >> 8) & 0x0000FF00;
1976 					unsigned int b = (((argb & 0x001F) * 2106   + 0x80) >> 8);
1977 
1978 					*(unsigned int*)destinationElement = a | r | g | b;
1979 
1980 					sourceElement += source.bytes;
1981 					destinationElement += destination.bytes;
1982 				}
1983 
1984 				sourceRow += source.pitchB;
1985 				destinationRow += destination.pitchB;
1986 			}
1987 
1988 			sourceSlice += source.sliceB;
1989 			destinationSlice += destination.sliceB;
1990 		}
1991 	}
1992 
decodeX4R4G4B4(Buffer & destination,const Buffer & source)1993 	void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source)
1994 	{
1995 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
1996 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
1997 
1998 		for(int z = 0; z < destination.depth && z < source.depth; z++)
1999 		{
2000 			unsigned char *sourceRow = sourceSlice;
2001 			unsigned char *destinationRow = destinationSlice;
2002 
2003 			for(int y = 0; y < destination.height && y < source.height; y++)
2004 			{
2005 				unsigned char *sourceElement = sourceRow;
2006 				unsigned char *destinationElement = destinationRow;
2007 
2008 				for(int x = 0; x < destination.width && x < source.width; x++)
2009 				{
2010 					unsigned int xrgb = *(unsigned short*)sourceElement;
2011 
2012 					unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000;
2013 					unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00;
2014 					unsigned int b =  (xrgb & 0x000F) * 0x00000011;
2015 
2016 					*(unsigned int*)destinationElement = 0xFF000000 | r | g | b;
2017 
2018 					sourceElement += source.bytes;
2019 					destinationElement += destination.bytes;
2020 				}
2021 
2022 				sourceRow += source.pitchB;
2023 				destinationRow += destination.pitchB;
2024 			}
2025 
2026 			sourceSlice += source.sliceB;
2027 			destinationSlice += destination.sliceB;
2028 		}
2029 	}
2030 
decodeA4R4G4B4(Buffer & destination,const Buffer & source)2031 	void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source)
2032 	{
2033 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2034 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2035 
2036 		for(int z = 0; z < destination.depth && z < source.depth; z++)
2037 		{
2038 			unsigned char *sourceRow = sourceSlice;
2039 			unsigned char *destinationRow = destinationSlice;
2040 
2041 			for(int y = 0; y < destination.height && y < source.height; y++)
2042 			{
2043 				unsigned char *sourceElement = sourceRow;
2044 				unsigned char *destinationElement = destinationRow;
2045 
2046 				for(int x = 0; x < destination.width && x < source.width; x++)
2047 				{
2048 					unsigned int argb = *(unsigned short*)sourceElement;
2049 
2050 					unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000;
2051 					unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000;
2052 					unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00;
2053 					unsigned int b =  (argb & 0x000F) * 0x00000011;
2054 
2055 					*(unsigned int*)destinationElement = a | r | g | b;
2056 
2057 					sourceElement += source.bytes;
2058 					destinationElement += destination.bytes;
2059 				}
2060 
2061 				sourceRow += source.pitchB;
2062 				destinationRow += destination.pitchB;
2063 			}
2064 
2065 			sourceSlice += source.sliceB;
2066 			destinationSlice += destination.sliceB;
2067 		}
2068 	}
2069 
decodeP8(Buffer & destination,const Buffer & source)2070 	void Surface::decodeP8(Buffer &destination, const Buffer &source)
2071 	{
2072 		unsigned char *sourceSlice = (unsigned char*)source.buffer;
2073 		unsigned char *destinationSlice = (unsigned char*)destination.buffer;
2074 
2075 		for(int z = 0; z < destination.depth && z < source.depth; z++)
2076 		{
2077 			unsigned char *sourceRow = sourceSlice;
2078 			unsigned char *destinationRow = destinationSlice;
2079 
2080 			for(int y = 0; y < destination.height && y < source.height; y++)
2081 			{
2082 				unsigned char *sourceElement = sourceRow;
2083 				unsigned char *destinationElement = destinationRow;
2084 
2085 				for(int x = 0; x < destination.width && x < source.width; x++)
2086 				{
2087 					unsigned int abgr = palette[*(unsigned char*)sourceElement];
2088 
2089 					unsigned int r = (abgr & 0x000000FF) << 16;
2090 					unsigned int g = (abgr & 0x0000FF00) << 0;
2091 					unsigned int b = (abgr & 0x00FF0000) >> 16;
2092 					unsigned int a = (abgr & 0xFF000000) >> 0;
2093 
2094 					*(unsigned int*)destinationElement = a | r | g | b;
2095 
2096 					sourceElement += source.bytes;
2097 					destinationElement += destination.bytes;
2098 				}
2099 
2100 				sourceRow += source.pitchB;
2101 				destinationRow += destination.pitchB;
2102 			}
2103 
2104 			sourceSlice += source.sliceB;
2105 			destinationSlice += destination.sliceB;
2106 		}
2107 	}
2108 
2109 #if S3TC_SUPPORT
decodeDXT1(Buffer & internal,const Buffer & external)2110 	void Surface::decodeDXT1(Buffer &internal, const Buffer &external)
2111 	{
2112 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2113 		const DXT1 *source = (const DXT1*)external.buffer;
2114 
2115 		for(int z = 0; z < external.depth; z++)
2116 		{
2117 			unsigned int *dest = destSlice;
2118 
2119 			for(int y = 0; y < external.height; y += 4)
2120 			{
2121 				for(int x = 0; x < external.width; x += 4)
2122 				{
2123 					Color<byte> c[4];
2124 
2125 					c[0] = source->c0;
2126 					c[1] = source->c1;
2127 
2128 					if(source->c0 > source->c1)   // No transparency
2129 					{
2130 						// c2 = 2 / 3 * c0 + 1 / 3 * c1
2131 						c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2132 						c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2133 						c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2134 						c[2].a = 0xFF;
2135 
2136 						// c3 = 1 / 3 * c0 + 2 / 3 * c1
2137 						c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2138 						c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2139 						c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2140 						c[3].a = 0xFF;
2141 					}
2142 					else   // c3 transparent
2143 					{
2144 						// c2 = 1 / 2 * c0 + 1 / 2 * c1
2145 						c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2);
2146 						c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2);
2147 						c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2);
2148 						c[2].a = 0xFF;
2149 
2150 						c[3].r = 0;
2151 						c[3].g = 0;
2152 						c[3].b = 0;
2153 						c[3].a = 0;
2154 					}
2155 
2156 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2157 					{
2158 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2159 						{
2160 							dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4];
2161 						}
2162 					}
2163 
2164 					source++;
2165 				}
2166 			}
2167 
2168 			(byte*&)destSlice += internal.sliceB;
2169 		}
2170 	}
2171 
decodeDXT3(Buffer & internal,const Buffer & external)2172 	void Surface::decodeDXT3(Buffer &internal, const Buffer &external)
2173 	{
2174 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2175 		const DXT3 *source = (const DXT3*)external.buffer;
2176 
2177 		for(int z = 0; z < external.depth; z++)
2178 		{
2179 			unsigned int *dest = destSlice;
2180 
2181 			for(int y = 0; y < external.height; y += 4)
2182 			{
2183 				for(int x = 0; x < external.width; x += 4)
2184 				{
2185 					Color<byte> c[4];
2186 
2187 					c[0] = source->c0;
2188 					c[1] = source->c1;
2189 
2190 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2191 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2192 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2193 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2194 
2195 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2196 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2197 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2198 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2199 
2200 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2201 					{
2202 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2203 						{
2204 							unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F;
2205 							unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24));
2206 
2207 							dest[(x + i) + (y + j) * internal.width] = color;
2208 						}
2209 					}
2210 
2211 					source++;
2212 				}
2213 			}
2214 
2215 			(byte*&)destSlice += internal.sliceB;
2216 		}
2217 	}
2218 
decodeDXT5(Buffer & internal,const Buffer & external)2219 	void Surface::decodeDXT5(Buffer &internal, const Buffer &external)
2220 	{
2221 		unsigned int *destSlice = (unsigned int*)internal.buffer;
2222 		const DXT5 *source = (const DXT5*)external.buffer;
2223 
2224 		for(int z = 0; z < external.depth; z++)
2225 		{
2226 			unsigned int *dest = destSlice;
2227 
2228 			for(int y = 0; y < external.height; y += 4)
2229 			{
2230 				for(int x = 0; x < external.width; x += 4)
2231 				{
2232 					Color<byte> c[4];
2233 
2234 					c[0] = source->c0;
2235 					c[1] = source->c1;
2236 
2237 					// c2 = 2 / 3 * c0 + 1 / 3 * c1
2238 					c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3);
2239 					c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3);
2240 					c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3);
2241 
2242 					// c3 = 1 / 3 * c0 + 2 / 3 * c1
2243 					c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3);
2244 					c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3);
2245 					c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3);
2246 
2247 					byte a[8];
2248 
2249 					a[0] = source->a0;
2250 					a[1] = source->a1;
2251 
2252 					if(a[0] > a[1])
2253 					{
2254 						a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7);
2255 						a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7);
2256 						a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7);
2257 						a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7);
2258 						a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7);
2259 						a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7);
2260 					}
2261 					else
2262 					{
2263 						a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5);
2264 						a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5);
2265 						a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5);
2266 						a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5);
2267 						a[6] = 0;
2268 						a[7] = 0xFF;
2269 					}
2270 
2271 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2272 					{
2273 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2274 						{
2275 							unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24;
2276 							unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha;
2277 
2278 							dest[(x + i) + (y + j) * internal.width] = color;
2279 						}
2280 					}
2281 
2282 					source++;
2283 				}
2284 			}
2285 
2286 			(byte*&)destSlice += internal.sliceB;
2287 		}
2288 	}
2289 #endif
2290 
decodeATI1(Buffer & internal,const Buffer & external)2291 	void Surface::decodeATI1(Buffer &internal, const Buffer &external)
2292 	{
2293 		byte *destSlice = (byte*)internal.buffer;
2294 		const ATI1 *source = (const ATI1*)external.buffer;
2295 
2296 		for(int z = 0; z < external.depth; z++)
2297 		{
2298 			byte *dest = destSlice;
2299 
2300 			for(int y = 0; y < external.height; y += 4)
2301 			{
2302 				for(int x = 0; x < external.width; x += 4)
2303 				{
2304 					byte r[8];
2305 
2306 					r[0] = source->r0;
2307 					r[1] = source->r1;
2308 
2309 					if(r[0] > r[1])
2310 					{
2311 						r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7);
2312 						r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7);
2313 						r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7);
2314 						r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7);
2315 						r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7);
2316 						r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7);
2317 					}
2318 					else
2319 					{
2320 						r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5);
2321 						r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5);
2322 						r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5);
2323 						r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5);
2324 						r[6] = 0;
2325 						r[7] = 0xFF;
2326 					}
2327 
2328 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2329 					{
2330 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2331 						{
2332 							dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8];
2333 						}
2334 					}
2335 
2336 					source++;
2337 				}
2338 			}
2339 
2340 			destSlice += internal.sliceB;
2341 		}
2342 	}
2343 
decodeATI2(Buffer & internal,const Buffer & external)2344 	void Surface::decodeATI2(Buffer &internal, const Buffer &external)
2345 	{
2346 		word *destSlice = (word*)internal.buffer;
2347 		const ATI2 *source = (const ATI2*)external.buffer;
2348 
2349 		for(int z = 0; z < external.depth; z++)
2350 		{
2351 			word *dest = destSlice;
2352 
2353 			for(int y = 0; y < external.height; y += 4)
2354 			{
2355 				for(int x = 0; x < external.width; x += 4)
2356 				{
2357 					byte X[8];
2358 
2359 					X[0] = source->x0;
2360 					X[1] = source->x1;
2361 
2362 					if(X[0] > X[1])
2363 					{
2364 						X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7);
2365 						X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7);
2366 						X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7);
2367 						X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7);
2368 						X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7);
2369 						X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7);
2370 					}
2371 					else
2372 					{
2373 						X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5);
2374 						X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5);
2375 						X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5);
2376 						X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5);
2377 						X[6] = 0;
2378 						X[7] = 0xFF;
2379 					}
2380 
2381 					byte Y[8];
2382 
2383 					Y[0] = source->y0;
2384 					Y[1] = source->y1;
2385 
2386 					if(Y[0] > Y[1])
2387 					{
2388 						Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7);
2389 						Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7);
2390 						Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7);
2391 						Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7);
2392 						Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7);
2393 						Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7);
2394 					}
2395 					else
2396 					{
2397 						Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5);
2398 						Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5);
2399 						Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5);
2400 						Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5);
2401 						Y[6] = 0;
2402 						Y[7] = 0xFF;
2403 					}
2404 
2405 					for(int j = 0; j < 4 && (y + j) < internal.height; j++)
2406 					{
2407 						for(int i = 0; i < 4 && (x + i) < internal.width; i++)
2408 						{
2409 							word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8];
2410 							word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8];
2411 
2412 							dest[(x + i) + (y + j) * internal.width] = (g << 8) + r;
2413 						}
2414 					}
2415 
2416 					source++;
2417 				}
2418 			}
2419 
2420 			(byte*&)destSlice += internal.sliceB;
2421 		}
2422 	}
2423 
decodeETC2(Buffer & internal,const Buffer & external,int nbAlphaBits,bool isSRGB)2424 	void Surface::decodeETC2(Buffer &internal, const Buffer &external, int nbAlphaBits, bool isSRGB)
2425 	{
2426 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2427 		                    (nbAlphaBits == 8) ? ETC_Decoder::ETC_RGBA : ((nbAlphaBits == 1) ? ETC_Decoder::ETC_RGB_PUNCHTHROUGH_ALPHA : ETC_Decoder::ETC_RGB));
2428 
2429 		if(isSRGB)
2430 		{
2431 			static byte sRGBtoLinearTable[256];
2432 			static bool sRGBtoLinearTableDirty = true;
2433 			if(sRGBtoLinearTableDirty)
2434 			{
2435 				for(int i = 0; i < 256; i++)
2436 				{
2437 					sRGBtoLinearTable[i] = static_cast<byte>(sRGBtoLinear(static_cast<float>(i) / 255.0f) * 255.0f + 0.5f);
2438 				}
2439 				sRGBtoLinearTableDirty = false;
2440 			}
2441 
2442 			// Perform sRGB conversion in place after decoding
2443 			byte* src = (byte*)internal.buffer;
2444 			for(int y = 0; y < internal.height; y++)
2445 			{
2446 				byte* srcRow = src + y * internal.pitchB;
2447 				for(int x = 0; x <  internal.width; x++)
2448 				{
2449 					byte* srcPix = srcRow + x * internal.bytes;
2450 					for(int i = 0; i < 3; i++)
2451 					{
2452 						srcPix[i] = sRGBtoLinearTable[srcPix[i]];
2453 					}
2454 				}
2455 			}
2456 		}
2457 	}
2458 
decodeEAC(Buffer & internal,const Buffer & external,int nbChannels,bool isSigned)2459 	void Surface::decodeEAC(Buffer &internal, const Buffer &external, int nbChannels, bool isSigned)
2460 	{
2461 		ASSERT(nbChannels == 1 || nbChannels == 2);
2462 
2463 		ETC_Decoder::Decode((const byte*)external.buffer, (byte*)internal.buffer, external.width, external.height, internal.width, internal.height, internal.pitchB, internal.bytes,
2464 		                    (nbChannels == 1) ? (isSigned ? ETC_Decoder::ETC_R_SIGNED : ETC_Decoder::ETC_R_UNSIGNED) : (isSigned ? ETC_Decoder::ETC_RG_SIGNED : ETC_Decoder::ETC_RG_UNSIGNED));
2465 
2466 		// FIXME: We convert signed data to float, until signed integer internal formats are supported
2467 		//        This code can be removed if signed ETC2 images are decoded to internal 8 bit signed R/RG formats
2468 		if(isSigned)
2469 		{
2470 			sbyte* src = (sbyte*)internal.buffer;
2471 
2472 			for(int y = 0; y < internal.height; y++)
2473 			{
2474 				sbyte* srcRow = src + y * internal.pitchB;
2475 				for(int x = internal.width - 1; x >= 0; x--)
2476 				{
2477 					int dx = x & 0xFFFFFFFC;
2478 					int mx = x - dx;
2479 					sbyte* srcPix = srcRow + dx * internal.bytes + mx * nbChannels;
2480 					float* dstPix = (float*)(srcRow + x * internal.bytes);
2481 					for(int c = nbChannels - 1; c >= 0; c--)
2482 					{
2483 						static const float normalization = 1.0f / 127.875f;
2484 						dstPix[c] = clamp(static_cast<float>(srcPix[c]) * normalization, -1.0f, 1.0f);
2485 					}
2486 				}
2487 			}
2488 		}
2489 	}
2490 
decodeASTC(Buffer & internal,const Buffer & external,int xBlockSize,int yBlockSize,int zBlockSize,bool isSRGB)2491 	void Surface::decodeASTC(Buffer &internal, const Buffer &external, int xBlockSize, int yBlockSize, int zBlockSize, bool isSRGB)
2492 	{
2493 	}
2494 
size(int width,int height,int depth,Format format)2495 	unsigned int Surface::size(int width, int height, int depth, Format format)
2496 	{
2497 		// Dimensions rounded up to multiples of 4, used for compressed formats
2498 		int width4 = align(width, 4);
2499 		int height4 = align(height, 4);
2500 
2501 		switch(format)
2502 		{
2503 		#if S3TC_SUPPORT
2504 		case FORMAT_DXT1:
2505 		#endif
2506 		case FORMAT_ATI1:
2507 		case FORMAT_ETC1:
2508 		case FORMAT_R11_EAC:
2509 		case FORMAT_SIGNED_R11_EAC:
2510 		case FORMAT_RGB8_ETC2:
2511 		case FORMAT_SRGB8_ETC2:
2512 		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2513 		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
2514 			return width4 * height4 * depth / 2;
2515 		#if S3TC_SUPPORT
2516 		case FORMAT_DXT3:
2517 		case FORMAT_DXT5:
2518 		#endif
2519 		case FORMAT_ATI2:
2520 		case FORMAT_RG11_EAC:
2521 		case FORMAT_SIGNED_RG11_EAC:
2522 		case FORMAT_RGBA8_ETC2_EAC:
2523 		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
2524 		case FORMAT_RGBA_ASTC_4x4_KHR:
2525 		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
2526 			return width4 * height4 * depth;
2527 		case FORMAT_RGBA_ASTC_5x4_KHR:
2528 		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
2529 			return align(width, 5) * height4 * depth;
2530 		case FORMAT_RGBA_ASTC_5x5_KHR:
2531 		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
2532 			return align(width, 5) * align(height, 5) * depth;
2533 		case FORMAT_RGBA_ASTC_6x5_KHR:
2534 		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
2535 			return align(width, 6) * align(height, 5) * depth;
2536 		case FORMAT_RGBA_ASTC_6x6_KHR:
2537 		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
2538 			return align(width, 6) * align(height, 6) * depth;
2539 		case FORMAT_RGBA_ASTC_8x5_KHR:
2540 		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
2541 			return align(width, 8) * align(height, 5) * depth;
2542 		case FORMAT_RGBA_ASTC_8x6_KHR:
2543 		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
2544 			return align(width, 8) * align(height, 6) * depth;
2545 		case FORMAT_RGBA_ASTC_8x8_KHR:
2546 		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
2547 			return align(width, 8) * align(height, 8) * depth;
2548 		case FORMAT_RGBA_ASTC_10x5_KHR:
2549 		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
2550 			return align(width, 10) * align(height, 5) * depth;
2551 		case FORMAT_RGBA_ASTC_10x6_KHR:
2552 		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
2553 			return align(width, 10) * align(height, 6) * depth;
2554 		case FORMAT_RGBA_ASTC_10x8_KHR:
2555 		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
2556 			return align(width, 10) * align(height, 8) * depth;
2557 		case FORMAT_RGBA_ASTC_10x10_KHR:
2558 		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
2559 			return align(width, 10) * align(height, 10) * depth;
2560 		case FORMAT_RGBA_ASTC_12x10_KHR:
2561 		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
2562 			return align(width, 12) * align(height, 10) * depth;
2563 		case FORMAT_RGBA_ASTC_12x12_KHR:
2564 		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
2565 			return align(width, 12) * align(height, 12) * depth;
2566 		case FORMAT_YV12_BT601:
2567 		case FORMAT_YV12_BT709:
2568 		case FORMAT_YV12_JFIF:
2569 			{
2570 				unsigned int YStride = align(width, 16);
2571 				unsigned int YSize = YStride * height;
2572 				unsigned int CStride = align(YStride / 2, 16);
2573 				unsigned int CSize = CStride * height / 2;
2574 
2575 				return YSize + 2 * CSize;
2576 			}
2577 		default:
2578 			return bytes(format) * width * height * depth;
2579 		}
2580 
2581 		return 0;
2582 	}
2583 
isStencil(Format format)2584 	bool Surface::isStencil(Format format)
2585 	{
2586 		switch(format)
2587 		{
2588 		case FORMAT_D32:
2589 		case FORMAT_D16:
2590 		case FORMAT_D24X8:
2591 		case FORMAT_D32F:
2592 		case FORMAT_D32F_COMPLEMENTARY:
2593 		case FORMAT_D32F_LOCKABLE:
2594 			return false;
2595 		case FORMAT_D24S8:
2596 		case FORMAT_D24FS8:
2597 		case FORMAT_S8:
2598 		case FORMAT_DF24S8:
2599 		case FORMAT_DF16S8:
2600 		case FORMAT_D32FS8_TEXTURE:
2601 		case FORMAT_D32FS8_SHADOW:
2602 		case FORMAT_INTZ:
2603 			return true;
2604 		default:
2605 			return false;
2606 		}
2607 	}
2608 
isDepth(Format format)2609 	bool Surface::isDepth(Format format)
2610 	{
2611 		switch(format)
2612 		{
2613 		case FORMAT_D32:
2614 		case FORMAT_D16:
2615 		case FORMAT_D24X8:
2616 		case FORMAT_D24S8:
2617 		case FORMAT_D24FS8:
2618 		case FORMAT_D32F:
2619 		case FORMAT_D32F_COMPLEMENTARY:
2620 		case FORMAT_D32F_LOCKABLE:
2621 		case FORMAT_DF24S8:
2622 		case FORMAT_DF16S8:
2623 		case FORMAT_D32FS8_TEXTURE:
2624 		case FORMAT_D32FS8_SHADOW:
2625 		case FORMAT_INTZ:
2626 			return true;
2627 		case FORMAT_S8:
2628 			return false;
2629 		default:
2630 			return false;
2631 		}
2632 	}
2633 
isPalette(Format format)2634 	bool Surface::isPalette(Format format)
2635 	{
2636 		switch(format)
2637 		{
2638 		case FORMAT_P8:
2639 		case FORMAT_A8P8:
2640 			return true;
2641 		default:
2642 			return false;
2643 		}
2644 	}
2645 
	// Classifies 'format' as floating-point (true) or not (false).
	// The table is intentionally explicit: a format that appears in neither
	// group triggers ASSERT(false) and is then reported as non-float.
	bool Surface::isFloatFormat(Format format)
	{
		switch(format)
		{
		// Normalized, integer, bump-map, luminance and YUV formats
		case FORMAT_R5G6B5:
		case FORMAT_R8G8B8:
		case FORMAT_B8G8R8:
		case FORMAT_X8R8G8B8:
		case FORMAT_X8B8G8R8I:
		case FORMAT_X8B8G8R8:
		case FORMAT_A8R8G8B8:
		case FORMAT_SRGB8_X8:
		case FORMAT_SRGB8_A8:
		case FORMAT_A8B8G8R8I:
		case FORMAT_R8UI:
		case FORMAT_G8R8UI:
		case FORMAT_X8B8G8R8UI:
		case FORMAT_A8B8G8R8UI:
		case FORMAT_A8B8G8R8:
		case FORMAT_G8R8I:
		case FORMAT_G8R8:
		case FORMAT_A2B10G10R10:
		case FORMAT_R8I_SNORM:
		case FORMAT_G8R8I_SNORM:
		case FORMAT_X8B8G8R8I_SNORM:
		case FORMAT_A8B8G8R8I_SNORM:
		case FORMAT_R16I:
		case FORMAT_R16UI:
		case FORMAT_G16R16I:
		case FORMAT_G16R16UI:
		case FORMAT_G16R16:
		case FORMAT_X16B16G16R16I:
		case FORMAT_X16B16G16R16UI:
		case FORMAT_A16B16G16R16I:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_A16B16G16R16:
		case FORMAT_V8U8:
		case FORMAT_Q8W8V8U8:
		case FORMAT_X8L8V8U8:
		case FORMAT_V16U16:
		case FORMAT_A16W16V16U16:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A8:
		case FORMAT_R8I:
		case FORMAT_R8:
		case FORMAT_L8:
		case FORMAT_L16:
		case FORMAT_A8L8:
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
		case FORMAT_R32I:
		case FORMAT_R32UI:
		case FORMAT_G32R32I:
		case FORMAT_G32R32UI:
		case FORMAT_X32B32G32R32I:
		case FORMAT_X32B32G32R32UI:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A32B32G32R32UI:
			return false;
		// Floating-point color, depth and luminance formats
		case FORMAT_R32F:
		case FORMAT_G32R32F:
		case FORMAT_X32B32G32R32F:
		case FORMAT_A32B32G32R32F:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_L16F:
		case FORMAT_A16L16F:
		case FORMAT_L32F:
		case FORMAT_A32L32F:
			return true;
		default:
			// Unlisted format: flag it in debug builds
			ASSERT(false);
		}

		// Fallback for unlisted formats when ASSERT does not abort
		return false;
	}
2726 
	// Reports whether component index 'component' of 'format' is treated as unsigned.
	// Formats whose listed channels are all unsigned return true for any index,
	// fully signed four-channel formats return false for any index, and formats
	// with one to three signed channels report only the indices past those
	// channels as unsigned. Unlisted formats trigger ASSERT(false) and yield false.
	bool Surface::isUnsignedComponent(Format format, int component)
	{
		switch(format)
		{
		case FORMAT_NULL:
		case FORMAT_R5G6B5:
		case FORMAT_R8G8B8:
		case FORMAT_B8G8R8:
		case FORMAT_X8R8G8B8:
		case FORMAT_X8B8G8R8:
		case FORMAT_A8R8G8B8:
		case FORMAT_A8B8G8R8:
		case FORMAT_SRGB8_X8:
		case FORMAT_SRGB8_A8:
		case FORMAT_G8R8:
		case FORMAT_A2B10G10R10:
		case FORMAT_R16UI:
		case FORMAT_G16R16:
		case FORMAT_G16R16UI:
		case FORMAT_X16B16G16R16UI:
		case FORMAT_A16B16G16R16:
		case FORMAT_A16B16G16R16UI:
		case FORMAT_R32UI:
		case FORMAT_G32R32UI:
		case FORMAT_X32B32G32R32UI:
		case FORMAT_A32B32G32R32UI:
		case FORMAT_R8UI:
		case FORMAT_G8R8UI:
		case FORMAT_X8B8G8R8UI:
		case FORMAT_A8B8G8R8UI:
		case FORMAT_D32F:
		case FORMAT_D32F_COMPLEMENTARY:
		case FORMAT_D32F_LOCKABLE:
		case FORMAT_D32FS8_TEXTURE:
		case FORMAT_D32FS8_SHADOW:
		case FORMAT_A8:
		case FORMAT_R8:
		case FORMAT_L8:
		case FORMAT_L16:
		case FORMAT_A8L8:
		case FORMAT_YV12_BT601:
		case FORMAT_YV12_BT709:
		case FORMAT_YV12_JFIF:
			return true;
		// All four components signed
		case FORMAT_A8B8G8R8I:
		case FORMAT_A16B16G16R16I:
		case FORMAT_A32B32G32R32I:
		case FORMAT_A8B8G8R8I_SNORM:
		case FORMAT_Q8W8V8U8:
		case FORMAT_Q16W16V16U16:
		case FORMAT_A32B32G32R32F:
			return false;
		// One signed component (index 0)
		case FORMAT_R32F:
		case FORMAT_R8I:
		case FORMAT_R16I:
		case FORMAT_R32I:
		case FORMAT_R8I_SNORM:
			return component >= 1;
		// Two signed components (indices 0 and 1)
		case FORMAT_V8U8:
		case FORMAT_X8L8V8U8:
		case FORMAT_V16U16:
		case FORMAT_G32R32F:
		case FORMAT_G8R8I:
		case FORMAT_G16R16I:
		case FORMAT_G32R32I:
		case FORMAT_G8R8I_SNORM:
			return component >= 2;
		// Three signed components (indices 0 through 2)
		case FORMAT_A16W16V16U16:
		case FORMAT_X32B32G32R32F:
		case FORMAT_X8B8G8R8I:
		case FORMAT_X16B16G16R16I:
		case FORMAT_X32B32G32R32I:
		case FORMAT_X8B8G8R8I_SNORM:
			return component >= 3;
		default:
			// Unlisted format: flag it in debug builds
			ASSERT(false);
		}

		// Fallback for unlisted formats when ASSERT does not abort
		return false;
	}
2807 
isSRGBreadable(Format format)2808 	bool Surface::isSRGBreadable(Format format)
2809 	{
2810 		// Keep in sync with Capabilities::isSRGBreadable
2811 		switch(format)
2812 		{
2813 		case FORMAT_L8:
2814 		case FORMAT_A8L8:
2815 		case FORMAT_R8G8B8:
2816 		case FORMAT_A8R8G8B8:
2817 		case FORMAT_X8R8G8B8:
2818 		case FORMAT_A8B8G8R8:
2819 		case FORMAT_X8B8G8R8:
2820 		case FORMAT_SRGB8_X8:
2821 		case FORMAT_SRGB8_A8:
2822 		case FORMAT_R5G6B5:
2823 		case FORMAT_X1R5G5B5:
2824 		case FORMAT_A1R5G5B5:
2825 		case FORMAT_A4R4G4B4:
2826 		#if S3TC_SUPPORT
2827 		case FORMAT_DXT1:
2828 		case FORMAT_DXT3:
2829 		case FORMAT_DXT5:
2830 		#endif
2831 		case FORMAT_ATI1:
2832 		case FORMAT_ATI2:
2833 			return true;
2834 		default:
2835 			return false;
2836 		}
2837 
2838 		return false;
2839 	}
2840 
isSRGBwritable(Format format)2841 	bool Surface::isSRGBwritable(Format format)
2842 	{
2843 		// Keep in sync with Capabilities::isSRGBwritable
2844 		switch(format)
2845 		{
2846 		case FORMAT_NULL:
2847 		case FORMAT_A8R8G8B8:
2848 		case FORMAT_X8R8G8B8:
2849 		case FORMAT_A8B8G8R8:
2850 		case FORMAT_X8B8G8R8:
2851 		case FORMAT_SRGB8_X8:
2852 		case FORMAT_SRGB8_A8:
2853 		case FORMAT_R5G6B5:
2854 			return true;
2855 		default:
2856 			return false;
2857 		}
2858 	}
2859 
	// Reports whether 'format' is one of the block-compressed formats
	// (DXT when S3TC_SUPPORT is enabled, ATI1/ATI2, ETC1, ETC2/EAC and ASTC).
	bool Surface::isCompressed(Format format)
	{
		switch(format)
		{
		// DXT labels exist only in builds with S3TC support
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_ATI1:
		case FORMAT_ATI2:
		case FORMAT_ETC1:
		case FORMAT_R11_EAC:
		case FORMAT_SIGNED_R11_EAC:
		case FORMAT_RG11_EAC:
		case FORMAT_SIGNED_RG11_EAC:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return true;
		default:
			return false;
		}
	}
2915 
isNonNormalizedInteger(Format format)2916 	bool Surface::isNonNormalizedInteger(Format format)
2917 	{
2918 		switch(format)
2919 		{
2920 		case FORMAT_A8B8G8R8I:
2921 		case FORMAT_X8B8G8R8I:
2922 		case FORMAT_G8R8I:
2923 		case FORMAT_R8I:
2924 		case FORMAT_A8B8G8R8UI:
2925 		case FORMAT_X8B8G8R8UI:
2926 		case FORMAT_G8R8UI:
2927 		case FORMAT_R8UI:
2928 		case FORMAT_A16B16G16R16I:
2929 		case FORMAT_X16B16G16R16I:
2930 		case FORMAT_G16R16I:
2931 		case FORMAT_R16I:
2932 		case FORMAT_A16B16G16R16UI:
2933 		case FORMAT_X16B16G16R16UI:
2934 		case FORMAT_G16R16UI:
2935 		case FORMAT_R16UI:
2936 		case FORMAT_A32B32G32R32I:
2937 		case FORMAT_X32B32G32R32I:
2938 		case FORMAT_G32R32I:
2939 		case FORMAT_R32I:
2940 		case FORMAT_A32B32G32R32UI:
2941 		case FORMAT_X32B32G32R32UI:
2942 		case FORMAT_G32R32UI:
2943 		case FORMAT_R32UI:
2944 			return true;
2945 		default:
2946 			return false;
2947 		}
2948 	}
2949 
componentCount(Format format)2950 	int Surface::componentCount(Format format)
2951 	{
2952 		switch(format)
2953 		{
2954 		case FORMAT_R5G6B5:         return 3;
2955 		case FORMAT_X8R8G8B8:       return 3;
2956 		case FORMAT_X8B8G8R8I:      return 3;
2957 		case FORMAT_X8B8G8R8:       return 3;
2958 		case FORMAT_A8R8G8B8:       return 4;
2959 		case FORMAT_SRGB8_X8:       return 3;
2960 		case FORMAT_SRGB8_A8:       return 4;
2961 		case FORMAT_A8B8G8R8I:      return 4;
2962 		case FORMAT_A8B8G8R8:       return 4;
2963 		case FORMAT_G8R8I:          return 2;
2964 		case FORMAT_G8R8:           return 2;
2965 		case FORMAT_R8I_SNORM:      return 1;
2966 		case FORMAT_G8R8I_SNORM:    return 2;
2967 		case FORMAT_X8B8G8R8I_SNORM:return 3;
2968 		case FORMAT_A8B8G8R8I_SNORM:return 4;
2969 		case FORMAT_R8UI:           return 1;
2970 		case FORMAT_G8R8UI:         return 2;
2971 		case FORMAT_X8B8G8R8UI:     return 3;
2972 		case FORMAT_A8B8G8R8UI:     return 4;
2973 		case FORMAT_A2B10G10R10:    return 4;
2974 		case FORMAT_G16R16I:        return 2;
2975 		case FORMAT_G16R16UI:       return 2;
2976 		case FORMAT_G16R16:         return 2;
2977 		case FORMAT_G32R32I:        return 2;
2978 		case FORMAT_G32R32UI:       return 2;
2979 		case FORMAT_X16B16G16R16I:  return 3;
2980 		case FORMAT_X16B16G16R16UI: return 3;
2981 		case FORMAT_A16B16G16R16I:  return 4;
2982 		case FORMAT_A16B16G16R16UI: return 4;
2983 		case FORMAT_A16B16G16R16:   return 4;
2984 		case FORMAT_X32B32G32R32I:  return 3;
2985 		case FORMAT_X32B32G32R32UI: return 3;
2986 		case FORMAT_A32B32G32R32I:  return 4;
2987 		case FORMAT_A32B32G32R32UI: return 4;
2988 		case FORMAT_V8U8:           return 2;
2989 		case FORMAT_Q8W8V8U8:       return 4;
2990 		case FORMAT_X8L8V8U8:       return 3;
2991 		case FORMAT_V16U16:         return 2;
2992 		case FORMAT_A16W16V16U16:   return 4;
2993 		case FORMAT_Q16W16V16U16:   return 4;
2994 		case FORMAT_R32F:           return 1;
2995 		case FORMAT_G32R32F:        return 2;
2996 		case FORMAT_X32B32G32R32F:  return 3;
2997 		case FORMAT_A32B32G32R32F:  return 4;
2998 		case FORMAT_D32F:           return 1;
2999 		case FORMAT_D32F_LOCKABLE:  return 1;
3000 		case FORMAT_D32FS8_TEXTURE: return 1;
3001 		case FORMAT_D32FS8_SHADOW:  return 1;
3002 		case FORMAT_A8:             return 1;
3003 		case FORMAT_R8I:            return 1;
3004 		case FORMAT_R8:             return 1;
3005 		case FORMAT_R16I:           return 1;
3006 		case FORMAT_R16UI:          return 1;
3007 		case FORMAT_R32I:           return 1;
3008 		case FORMAT_R32UI:          return 1;
3009 		case FORMAT_L8:             return 1;
3010 		case FORMAT_L16:            return 1;
3011 		case FORMAT_A8L8:           return 2;
3012 		case FORMAT_YV12_BT601:     return 3;
3013 		case FORMAT_YV12_BT709:     return 3;
3014 		case FORMAT_YV12_JFIF:      return 3;
3015 		default:
3016 			ASSERT(false);
3017 		}
3018 
3019 		return 1;
3020 	}
3021 
allocateBuffer(int width,int height,int depth,Format format)3022 	void *Surface::allocateBuffer(int width, int height, int depth, Format format)
3023 	{
3024 		// Render targets require 2x2 quads
3025 		int width2 = (width + 1) & ~1;
3026 		int height2 = (height + 1) & ~1;
3027 
3028 		// FIXME: Unpacking byte4 to short4 in the sampler currently involves reading 8 bytes,
3029 		// so we have to allocate 4 extra bytes to avoid buffer overruns.
3030 		return allocateZero(size(width2, height2, depth, format) + 4);
3031 	}
3032 
memfill4(void * buffer,int pattern,int bytes)3033 	void Surface::memfill4(void *buffer, int pattern, int bytes)
3034 	{
3035 		while((size_t)buffer & 0x1 && bytes >= 1)
3036 		{
3037 			*(char*)buffer = (char)pattern;
3038 			(char*&)buffer += 1;
3039 			bytes -= 1;
3040 		}
3041 
3042 		while((size_t)buffer & 0x3 && bytes >= 2)
3043 		{
3044 			*(short*)buffer = (short)pattern;
3045 			(short*&)buffer += 1;
3046 			bytes -= 2;
3047 		}
3048 
3049 		if(CPUID::supportsSSE())
3050 		{
3051 			while((size_t)buffer & 0xF && bytes >= 4)
3052 			{
3053 				*(int*)buffer = pattern;
3054 				(int*&)buffer += 1;
3055 				bytes -= 4;
3056 			}
3057 
3058 			__m128 quad = _mm_set_ps1((float&)pattern);
3059 
3060 			float *pointer = (float*)buffer;
3061 			int qxwords = bytes / 64;
3062 			bytes -= qxwords * 64;
3063 
3064 			while(qxwords--)
3065 			{
3066 				_mm_stream_ps(pointer + 0, quad);
3067 				_mm_stream_ps(pointer + 4, quad);
3068 				_mm_stream_ps(pointer + 8, quad);
3069 				_mm_stream_ps(pointer + 12, quad);
3070 
3071 				pointer += 16;
3072 			}
3073 
3074 			buffer = pointer;
3075 		}
3076 
3077 		while(bytes >= 4)
3078 		{
3079 			*(int*)buffer = (int)pattern;
3080 			(int*&)buffer += 1;
3081 			bytes -= 4;
3082 		}
3083 
3084 		while(bytes >= 2)
3085 		{
3086 			*(short*)buffer = (short)pattern;
3087 			(short*&)buffer += 1;
3088 			bytes -= 2;
3089 		}
3090 
3091 		while(bytes >= 1)
3092 		{
3093 			*(char*)buffer = (char)pattern;
3094 			(char*&)buffer += 1;
3095 			bytes -= 1;
3096 		}
3097 	}
3098 
isEntire(const SliceRect & rect) const3099 	bool Surface::isEntire(const SliceRect& rect) const
3100 	{
3101 		return (rect.x0 == 0 && rect.y0 == 0 && rect.x1 == internal.width && rect.y1 == internal.height && internal.depth == 1);
3102 	}
3103 
	// Returns a rectangle spanning (0, 0) to the internal buffer's width and height.
	// The trailing 0 is presumably the slice index - confirm against SliceRect.
	SliceRect Surface::getRect() const
	{
		return SliceRect(0, 0, internal.width, internal.height, 0);
	}
3108 
clearDepth(float depth,int x0,int y0,int width,int height)3109 	void Surface::clearDepth(float depth, int x0, int y0, int width, int height)
3110 	{
3111 		if(width == 0 || height == 0) return;
3112 
3113 		// Not overlapping
3114 		if(x0 > internal.width) return;
3115 		if(y0 > internal.height) return;
3116 		if(x0 + width < 0) return;
3117 		if(y0 + height < 0) return;
3118 
3119 		// Clip against dimensions
3120 		if(x0 < 0) {width += x0; x0 = 0;}
3121 		if(x0 + width > internal.width) width = internal.width - x0;
3122 		if(y0 < 0) {height += y0; y0 = 0;}
3123 		if(y0 + height > internal.height) height = internal.height - y0;
3124 
3125 		const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height;
3126 		const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY;
3127 
3128 		int width2 = (internal.width + 1) & ~1;
3129 
3130 		int x1 = x0 + width;
3131 		int y1 = y0 + height;
3132 
3133 		if(internal.format == FORMAT_D32F_LOCKABLE ||
3134 		   internal.format == FORMAT_D32FS8_TEXTURE ||
3135 		   internal.format == FORMAT_D32FS8_SHADOW)
3136 		{
3137 			float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0;
3138 
3139 			for(int z = 0; z < internal.depth; z++)
3140 			{
3141 				for(int y = y0; y < y1; y++)
3142 				{
3143 					memfill4(target, (int&)depth, 4 * width);
3144 					target += width2;
3145 				}
3146 			}
3147 
3148 			unlockInternal();
3149 		}
3150 		else   // Quad layout
3151 		{
3152 			if(complementaryDepthBuffer)
3153 			{
3154 				depth = 1 - depth;
3155 			}
3156 
3157 			float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC);
3158 
3159 			int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3160 			int oddX1 = (x1 & ~1) * 2;
3161 			int evenX0 = ((x0 + 1) & ~1) * 2;
3162 			int evenBytes = (oddX1 - evenX0) * sizeof(float);
3163 
3164 			for(int z = 0; z < internal.depth; z++)
3165 			{
3166 				for(int y = y0; y < y1; y++)
3167 				{
3168 					float *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3169 
3170 					if((y & 1) == 0 && y + 1 < y1)   // Fill quad line at once
3171 					{
3172 						if((x0 & 1) != 0)
3173 						{
3174 							target[oddX0 + 0] = depth;
3175 							target[oddX0 + 2] = depth;
3176 						}
3177 
3178 					//	for(int x2 = evenX0; x2 < x1 * 2; x2 += 4)
3179 					//	{
3180 					//		target[x2 + 0] = depth;
3181 					//		target[x2 + 1] = depth;
3182 					//		target[x2 + 2] = depth;
3183 					//		target[x2 + 3] = depth;
3184 					//	}
3185 
3186 					//	__asm
3187 					//	{
3188 					//		movss xmm0, depth
3189 					//		shufps xmm0, xmm0, 0x00
3190 					//
3191 					//		mov eax, x0
3192 					//		add eax, 1
3193 					//		and eax, 0xFFFFFFFE
3194 					//		cmp eax, x1
3195 					//		jge qEnd
3196 					//
3197 					//		mov edi, target
3198 					//
3199 					//	qLoop:
3200 					//		movntps [edi+8*eax], xmm0
3201 					//
3202 					//		add eax, 2
3203 					//		cmp eax, x1
3204 					//		jl qLoop
3205 					//	qEnd:
3206 					//	}
3207 
3208 						memfill4(&target[evenX0], (int&)depth, evenBytes);
3209 
3210 						if((x1 & 1) != 0)
3211 						{
3212 							target[oddX1 + 0] = depth;
3213 							target[oddX1 + 2] = depth;
3214 						}
3215 
3216 						y++;
3217 					}
3218 					else
3219 					{
3220 						for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3221 						{
3222 							target[i] = depth;
3223 						}
3224 					}
3225 				}
3226 
3227 				buffer += internal.sliceP;
3228 			}
3229 
3230 			unlockInternal();
3231 		}
3232 	}
3233 
clearStencil(unsigned char s,unsigned char mask,int x0,int y0,int width,int height)3234 	void Surface::clearStencil(unsigned char s, unsigned char mask, int x0, int y0, int width, int height)
3235 	{
3236 		if(mask == 0 || width == 0 || height == 0) return;
3237 
3238 		// Not overlapping
3239 		if(x0 > internal.width) return;
3240 		if(y0 > internal.height) return;
3241 		if(x0 + width < 0) return;
3242 		if(y0 + height < 0) return;
3243 
3244 		// Clip against dimensions
3245 		if(x0 < 0) {width += x0; x0 = 0;}
3246 		if(x0 + width > internal.width) width = internal.width - x0;
3247 		if(y0 < 0) {height += y0; y0 = 0;}
3248 		if(y0 + height > internal.height) height = internal.height - y0;
3249 
3250 		int width2 = (internal.width + 1) & ~1;
3251 
3252 		int x1 = x0 + width;
3253 		int y1 = y0 + height;
3254 
3255 		int oddX0 = (x0 & ~1) * 2 + (x0 & 1);
3256 		int oddX1 = (x1 & ~1) * 2;
3257 		int evenX0 = ((x0 + 1) & ~1) * 2;
3258 		int evenBytes = oddX1 - evenX0;
3259 
3260 		unsigned char maskedS = s & mask;
3261 		unsigned char invMask = ~mask;
3262 		unsigned int fill = maskedS;
3263 		fill = fill | (fill << 8) | (fill << 16) + (fill << 24);
3264 
3265 		char *buffer = (char*)lockStencil(0, PUBLIC);
3266 
3267 		// Stencil buffers are assumed to use quad layout
3268 		for(int z = 0; z < stencil.depth; z++)
3269 		{
3270 			for(int y = y0; y < y1; y++)
3271 			{
3272 				char *target = buffer + (y & ~1) * width2 + (y & 1) * 2;
3273 
3274 				if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF)   // Fill quad line at once
3275 				{
3276 					if((x0 & 1) != 0)
3277 					{
3278 						target[oddX0 + 0] = fill;
3279 						target[oddX0 + 2] = fill;
3280 					}
3281 
3282 					memfill4(&target[evenX0], fill, evenBytes);
3283 
3284 					if((x1 & 1) != 0)
3285 					{
3286 						target[oddX1 + 0] = fill;
3287 						target[oddX1 + 2] = fill;
3288 					}
3289 
3290 					y++;
3291 				}
3292 				else
3293 				{
3294 					for(int x = x0, i = oddX0; x < x1; x++, i = (x & ~1) * 2 + (x & 1))
3295 					{
3296 						target[i] = maskedS | (target[i] & invMask);
3297 					}
3298 				}
3299 			}
3300 
3301 			buffer += stencil.sliceP;
3302 		}
3303 
3304 		unlockStencil();
3305 	}
3306 
fill(const Color<float> & color,int x0,int y0,int width,int height)3307 	void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height)
3308 	{
3309 		unsigned char *row;
3310 		Buffer *buffer;
3311 
3312 		if(internal.dirty)
3313 		{
3314 			row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3315 			buffer = &internal;
3316 		}
3317 		else
3318 		{
3319 			row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC);
3320 			buffer = &external;
3321 		}
3322 
3323 		if(buffer->bytes <= 4)
3324 		{
3325 			int c;
3326 			buffer->write(&c, color);
3327 
3328 			if(buffer->bytes <= 1) c = (c << 8)  | c;
3329 			if(buffer->bytes <= 2) c = (c << 16) | c;
3330 
3331 			for(int y = 0; y < height; y++)
3332 			{
3333 				memfill4(row, c, width * buffer->bytes);
3334 
3335 				row += buffer->pitchB;
3336 			}
3337 		}
3338 		else   // Generic
3339 		{
3340 			for(int y = 0; y < height; y++)
3341 			{
3342 				unsigned char *element = row;
3343 
3344 				for(int x = 0; x < width; x++)
3345 				{
3346 					buffer->write(element, color);
3347 
3348 					element += buffer->bytes;
3349 				}
3350 
3351 				row += buffer->pitchB;
3352 			}
3353 		}
3354 
3355 		if(buffer == &internal)
3356 		{
3357 			unlockInternal();
3358 		}
3359 		else
3360 		{
3361 			unlockExternal();
3362 		}
3363 	}
3364 
copyInternal(const Surface * source,int x,int y,float srcX,float srcY,bool filter)3365 	void Surface::copyInternal(const Surface* source, int x, int y, float srcX, float srcY, bool filter)
3366 	{
3367 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3368 
3369 		sw::Color<float> color;
3370 
3371 		if(!filter)
3372 		{
3373 			color = source->internal.read((int)srcX, (int)srcY);
3374 		}
3375 		else   // Bilinear filtering
3376 		{
3377 			color = source->internal.sample(srcX, srcY);
3378 		}
3379 
3380 		internal.write(x, y, color);
3381 	}
3382 
copyInternal(const Surface * source,int x,int y,int z,float srcX,float srcY,float srcZ,bool filter)3383 	void Surface::copyInternal(const Surface* source, int x, int y, int z, float srcX, float srcY, float srcZ, bool filter)
3384 	{
3385 		ASSERT(internal.lock != LOCK_UNLOCKED && source && source->internal.lock != LOCK_UNLOCKED);
3386 
3387 		sw::Color<float> color;
3388 
3389 		if(!filter)
3390 		{
3391 			color = source->internal.read((int)srcX, (int)srcY, int(srcZ));
3392 		}
3393 		else   // Bilinear filtering
3394 		{
3395 			color = source->internal.sample(srcX, srcY, srcZ);
3396 		}
3397 
3398 		internal.write(x, y, z, color);
3399 	}
3400 
	// Reports whether the external format of this surface has a stencil component.
	bool Surface::hasStencil() const
	{
		return isStencil(external.format);
	}
3405 
	// Reports whether the external format of this surface has a depth component.
	bool Surface::hasDepth() const
	{
		return isDepth(external.format);
	}
3410 
	// Reports whether the external format of this surface is palettized.
	bool Surface::hasPalette() const
	{
		return isPalette(external.format);
	}
3415 
	// Returns the surface's renderTarget flag.
	bool Surface::isRenderTarget() const
	{
		return renderTarget;
	}
3420 
	// Returns the surface's dirtyMipmaps flag.
	bool Surface::hasDirtyMipmaps() const
	{
		return dirtyMipmaps;
	}
3425 
	// Clears the dirtyMipmaps flag; no mipmap data is touched here.
	void Surface::cleanMipmaps()
	{
		dirtyMipmaps = false;
	}
3430 
	// Returns the Resource pointer held by this surface (ownership is not transferred here).
	Resource *Surface::getResource()
	{
		return resource;
	}
3435 
identicalFormats() const3436 	bool Surface::identicalFormats() const
3437 	{
3438 		return external.format == internal.format &&
3439 		       external.width  == internal.width &&
3440 		       external.height == internal.height &&
3441 		       external.depth  == internal.depth &&
3442 		       external.pitchB == internal.pitchB &&
3443 		       external.sliceB == internal.sliceB;
3444 	}
3445 
	// Maps an external format to the format used for the internal buffer.
	// Formats that are listed map either to themselves or to a wider format that
	// can represent them; the choice for some formats also depends on the
	// 'lockable' and 'hasParent' members and on the global quadLayoutEnabled and
	// complementaryDepthBuffer settings. Unlisted formats trigger ASSERT(false)
	// and map to FORMAT_NULL.
	Format Surface::selectInternalFormat(Format format) const
	{
		switch(format)
		{
		case FORMAT_NULL:
			return FORMAT_NULL;
		// Palettized and low-precision alpha formats are expanded to 8 bits per channel
		case FORMAT_P8:
		case FORMAT_A8P8:
		case FORMAT_A4R4G4B4:
		case FORMAT_A1R5G5B5:
		case FORMAT_A8R3G3B2:
			return FORMAT_A8R8G8B8;
		case FORMAT_A8:
			return FORMAT_A8;
		case FORMAT_R8I:
			return FORMAT_R8I;
		case FORMAT_R8UI:
			return FORMAT_R8UI;
		case FORMAT_R8I_SNORM:
			return FORMAT_R8I_SNORM;
		case FORMAT_R8:
			return FORMAT_R8;
		case FORMAT_R16I:
			return FORMAT_R16I;
		case FORMAT_R16UI:
			return FORMAT_R16UI;
		case FORMAT_R32I:
			return FORMAT_R32I;
		case FORMAT_R32UI:
			return FORMAT_R32UI;
		// X variants of 16/32-bit integer formats share storage with their A variants
		case FORMAT_X16B16G16R16I:
		case FORMAT_A16B16G16R16I:
			return FORMAT_A16B16G16R16I;
		case FORMAT_X16B16G16R16UI:
		case FORMAT_A16B16G16R16UI:
			return FORMAT_A16B16G16R16UI;
		case FORMAT_A2R10G10B10:
		case FORMAT_A2B10G10R10:
		case FORMAT_A16B16G16R16:
			return FORMAT_A16B16G16R16;
		case FORMAT_X32B32G32R32I:
		case FORMAT_A32B32G32R32I:
			return FORMAT_A32B32G32R32I;
		case FORMAT_X32B32G32R32UI:
		case FORMAT_A32B32G32R32UI:
			return FORMAT_A32B32G32R32UI;
		case FORMAT_G8R8I:
			return FORMAT_G8R8I;
		case FORMAT_G8R8UI:
			return FORMAT_G8R8UI;
		case FORMAT_G8R8I_SNORM:
			return FORMAT_G8R8I_SNORM;
		case FORMAT_G8R8:
			return FORMAT_G8R8;
		case FORMAT_G16R16I:
			return FORMAT_G16R16I;
		case FORMAT_G16R16UI:
			return FORMAT_G16R16UI;
		case FORMAT_G16R16:
			return FORMAT_G16R16;
		case FORMAT_G32R32I:
			return FORMAT_G32R32I;
		case FORMAT_G32R32UI:
			return FORMAT_G32R32UI;
		case FORMAT_A8R8G8B8:
			// The quad-layout variant is chosen only for non-lockable surfaces
			// when quad layout is enabled
			if(lockable || !quadLayoutEnabled)
			{
				return FORMAT_A8R8G8B8;
			}
			else
			{
				return FORMAT_A8G8R8B8Q;
			}
		case FORMAT_A8B8G8R8I:
			return FORMAT_A8B8G8R8I;
		case FORMAT_A8B8G8R8UI:
			return FORMAT_A8B8G8R8UI;
		case FORMAT_A8B8G8R8I_SNORM:
			return FORMAT_A8B8G8R8I_SNORM;
		case FORMAT_R5G5B5A1:
		case FORMAT_R4G4B4A4:
		case FORMAT_A8B8G8R8:
			return FORMAT_A8B8G8R8;
		case FORMAT_R5G6B5:
			return FORMAT_R5G6B5;
		case FORMAT_R3G3B2:
		case FORMAT_R8G8B8:
		case FORMAT_X4R4G4B4:
		case FORMAT_X1R5G5B5:
		case FORMAT_X8R8G8B8:
			// Same lockable / quad-layout selection as for FORMAT_A8R8G8B8
			if(lockable || !quadLayoutEnabled)
			{
				return FORMAT_X8R8G8B8;
			}
			else
			{
				return FORMAT_X8G8R8B8Q;
			}
		case FORMAT_X8B8G8R8I:
			return FORMAT_X8B8G8R8I;
		case FORMAT_X8B8G8R8UI:
			return FORMAT_X8B8G8R8UI;
		case FORMAT_X8B8G8R8I_SNORM:
			return FORMAT_X8B8G8R8I_SNORM;
		case FORMAT_B8G8R8:
		case FORMAT_X8B8G8R8:
			return FORMAT_X8B8G8R8;
		case FORMAT_SRGB8_X8:
			return FORMAT_SRGB8_X8;
		case FORMAT_SRGB8_A8:
			return FORMAT_SRGB8_A8;
		// Compressed formats
		#if S3TC_SUPPORT
		case FORMAT_DXT1:
		case FORMAT_DXT3:
		case FORMAT_DXT5:
		#endif
		case FORMAT_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
		case FORMAT_RGBA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ETC2_EAC:
		case FORMAT_SRGB8_ALPHA8_ASTC_4x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x4_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_5x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_6x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_8x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x5_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x6_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x8_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_10x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x10_KHR:
		case FORMAT_SRGB8_ALPHA8_ASTC_12x12_KHR:
			return FORMAT_A8R8G8B8;
		case FORMAT_RGBA_ASTC_4x4_KHR:
		case FORMAT_RGBA_ASTC_5x4_KHR:
		case FORMAT_RGBA_ASTC_5x5_KHR:
		case FORMAT_RGBA_ASTC_6x5_KHR:
		case FORMAT_RGBA_ASTC_6x6_KHR:
		case FORMAT_RGBA_ASTC_8x5_KHR:
		case FORMAT_RGBA_ASTC_8x6_KHR:
		case FORMAT_RGBA_ASTC_8x8_KHR:
		case FORMAT_RGBA_ASTC_10x5_KHR:
		case FORMAT_RGBA_ASTC_10x6_KHR:
		case FORMAT_RGBA_ASTC_10x8_KHR:
		case FORMAT_RGBA_ASTC_10x10_KHR:
		case FORMAT_RGBA_ASTC_12x10_KHR:
		case FORMAT_RGBA_ASTC_12x12_KHR:
			// ASTC supports HDR, so a floating point format is required to represent it properly
			return FORMAT_A32B32G32R32F; // FIXME: 16FP is probably sufficient, but it's currently unsupported
		case FORMAT_ATI1:
		case FORMAT_R11_EAC:
			return FORMAT_R8;
		case FORMAT_SIGNED_R11_EAC:
			return FORMAT_R32F; // FIXME: Signed 8bit format would be sufficient
		case FORMAT_ATI2:
		case FORMAT_RG11_EAC:
			return FORMAT_G8R8;
		case FORMAT_SIGNED_RG11_EAC:
			return FORMAT_G32R32F; // FIXME: Signed 8bit format would be sufficient
		case FORMAT_ETC1:
		case FORMAT_RGB8_ETC2:
		case FORMAT_SRGB8_ETC2:
			return FORMAT_X8R8G8B8;
		// Bumpmap formats
		case FORMAT_V8U8:			return FORMAT_V8U8;
		case FORMAT_L6V5U5:			return FORMAT_X8L8V8U8;
		case FORMAT_Q8W8V8U8:		return FORMAT_Q8W8V8U8;
		case FORMAT_X8L8V8U8:		return FORMAT_X8L8V8U8;
		case FORMAT_V16U16:			return FORMAT_V16U16;
		case FORMAT_A2W10V10U10:	return FORMAT_A16W16V16U16;
		case FORMAT_Q16W16V16U16:	return FORMAT_Q16W16V16U16;
		// Floating-point formats (16-bit float inputs are widened to 32-bit float)
		case FORMAT_A16F:			return FORMAT_A32B32G32R32F;
		case FORMAT_R16F:			return FORMAT_R32F;
		case FORMAT_G16R16F:		return FORMAT_G32R32F;
		case FORMAT_B16G16R16F:     return FORMAT_X32B32G32R32F;
		case FORMAT_A16B16G16R16F:	return FORMAT_A32B32G32R32F;
		case FORMAT_A32F:			return FORMAT_A32B32G32R32F;
		case FORMAT_R32F:			return FORMAT_R32F;
		case FORMAT_G32R32F:		return FORMAT_G32R32F;
		case FORMAT_B32G32R32F:     return FORMAT_X32B32G32R32F;
		case FORMAT_X32B32G32R32F:  return FORMAT_X32B32G32R32F;
		case FORMAT_A32B32G32R32F:	return FORMAT_A32B32G32R32F;
		// Luminance formats
		case FORMAT_L8:				return FORMAT_L8;
		case FORMAT_A4L4:			return FORMAT_A8L8;
		case FORMAT_L16:			return FORMAT_L16;
		case FORMAT_A8L8:			return FORMAT_A8L8;
		case FORMAT_L16F:           return FORMAT_X32B32G32R32F;
		case FORMAT_A16L16F:        return FORMAT_A32B32G32R32F;
		case FORMAT_L32F:           return FORMAT_X32B32G32R32F;
		case FORMAT_A32L32F:        return FORMAT_A32B32G32R32F;
		// Depth/stencil formats
		case FORMAT_D16:
		case FORMAT_D32:
		case FORMAT_D24X8:
		case FORMAT_D24S8:
		case FORMAT_D24FS8:
			// Integer depth inputs are stored as 32-bit float depth; the variant
			// depends on whether the surface belongs to a texture and on the
			// complementary depth buffer setting
			if(hasParent)   // Texture
			{
				return FORMAT_D32FS8_SHADOW;
			}
			else if(complementaryDepthBuffer)
			{
				return FORMAT_D32F_COMPLEMENTARY;
			}
			else
			{
				return FORMAT_D32F;
			}
		case FORMAT_D32F:           return FORMAT_D32F;
		case FORMAT_D32F_LOCKABLE:  return FORMAT_D32F_LOCKABLE;
		case FORMAT_D32FS8_TEXTURE: return FORMAT_D32FS8_TEXTURE;
		case FORMAT_INTZ:           return FORMAT_D32FS8_TEXTURE;
		case FORMAT_DF24S8:         return FORMAT_D32FS8_SHADOW;
		case FORMAT_DF16S8:         return FORMAT_D32FS8_SHADOW;
		// YUV formats are kept as-is
		case FORMAT_YV12_BT601:     return FORMAT_YV12_BT601;
		case FORMAT_YV12_BT709:     return FORMAT_YV12_BT709;
		case FORMAT_YV12_JFIF:      return FORMAT_YV12_JFIF;
		default:
			// Unlisted format: flag it in debug builds
			ASSERT(false);
		}

		// Fallback for unlisted formats when ASSERT does not abort
		return FORMAT_NULL;
	}
3674 
setTexturePalette(unsigned int * palette)3675 	void Surface::setTexturePalette(unsigned int *palette)
3676 	{
3677 		Surface::palette = palette;
3678 		Surface::paletteID++;
3679 	}
3680 
resolve()3681 	void Surface::resolve()
3682 	{
3683 		if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL)
3684 		{
3685 			return;
3686 		}
3687 
3688 		void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE);
3689 
3690 		int quality = internal.depth;
3691 		int width = internal.width;
3692 		int height = internal.height;
3693 		int pitch = internal.pitchB;
3694 		int slice = internal.sliceB;
3695 
3696 		unsigned char *source0 = (unsigned char*)source;
3697 		unsigned char *source1 = source0 + slice;
3698 		unsigned char *source2 = source1 + slice;
3699 		unsigned char *source3 = source2 + slice;
3700 		unsigned char *source4 = source3 + slice;
3701 		unsigned char *source5 = source4 + slice;
3702 		unsigned char *source6 = source5 + slice;
3703 		unsigned char *source7 = source6 + slice;
3704 		unsigned char *source8 = source7 + slice;
3705 		unsigned char *source9 = source8 + slice;
3706 		unsigned char *sourceA = source9 + slice;
3707 		unsigned char *sourceB = sourceA + slice;
3708 		unsigned char *sourceC = sourceB + slice;
3709 		unsigned char *sourceD = sourceC + slice;
3710 		unsigned char *sourceE = sourceD + slice;
3711 		unsigned char *sourceF = sourceE + slice;
3712 
3713 		if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8 ||
3714 		   internal.format == FORMAT_X8B8G8R8 || internal.format == FORMAT_A8B8G8R8 ||
3715 		   internal.format == FORMAT_SRGB8_X8 || internal.format == FORMAT_SRGB8_A8)
3716 		{
3717 			if(CPUID::supportsSSE2() && (width % 4) == 0)
3718 			{
3719 				if(internal.depth == 2)
3720 				{
3721 					for(int y = 0; y < height; y++)
3722 					{
3723 						for(int x = 0; x < width; x += 4)
3724 						{
3725 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3726 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3727 
3728 							c0 = _mm_avg_epu8(c0, c1);
3729 
3730 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3731 						}
3732 
3733 						source0 += pitch;
3734 						source1 += pitch;
3735 					}
3736 				}
3737 				else if(internal.depth == 4)
3738 				{
3739 					for(int y = 0; y < height; y++)
3740 					{
3741 						for(int x = 0; x < width; x += 4)
3742 						{
3743 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3744 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3745 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3746 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3747 
3748 							c0 = _mm_avg_epu8(c0, c1);
3749 							c2 = _mm_avg_epu8(c2, c3);
3750 							c0 = _mm_avg_epu8(c0, c2);
3751 
3752 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3753 						}
3754 
3755 						source0 += pitch;
3756 						source1 += pitch;
3757 						source2 += pitch;
3758 						source3 += pitch;
3759 					}
3760 				}
3761 				else if(internal.depth == 8)
3762 				{
3763 					for(int y = 0; y < height; y++)
3764 					{
3765 						for(int x = 0; x < width; x += 4)
3766 						{
3767 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3768 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3769 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3770 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3771 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3772 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3773 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3774 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3775 
3776 							c0 = _mm_avg_epu8(c0, c1);
3777 							c2 = _mm_avg_epu8(c2, c3);
3778 							c4 = _mm_avg_epu8(c4, c5);
3779 							c6 = _mm_avg_epu8(c6, c7);
3780 							c0 = _mm_avg_epu8(c0, c2);
3781 							c4 = _mm_avg_epu8(c4, c6);
3782 							c0 = _mm_avg_epu8(c0, c4);
3783 
3784 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3785 						}
3786 
3787 						source0 += pitch;
3788 						source1 += pitch;
3789 						source2 += pitch;
3790 						source3 += pitch;
3791 						source4 += pitch;
3792 						source5 += pitch;
3793 						source6 += pitch;
3794 						source7 += pitch;
3795 					}
3796 				}
3797 				else if(internal.depth == 16)
3798 				{
3799 					for(int y = 0; y < height; y++)
3800 					{
3801 						for(int x = 0; x < width; x += 4)
3802 						{
3803 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
3804 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
3805 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
3806 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
3807 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
3808 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
3809 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
3810 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
3811 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
3812 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
3813 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
3814 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
3815 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
3816 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
3817 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
3818 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
3819 
3820 							c0 = _mm_avg_epu8(c0, c1);
3821 							c2 = _mm_avg_epu8(c2, c3);
3822 							c4 = _mm_avg_epu8(c4, c5);
3823 							c6 = _mm_avg_epu8(c6, c7);
3824 							c8 = _mm_avg_epu8(c8, c9);
3825 							cA = _mm_avg_epu8(cA, cB);
3826 							cC = _mm_avg_epu8(cC, cD);
3827 							cE = _mm_avg_epu8(cE, cF);
3828 							c0 = _mm_avg_epu8(c0, c2);
3829 							c4 = _mm_avg_epu8(c4, c6);
3830 							c8 = _mm_avg_epu8(c8, cA);
3831 							cC = _mm_avg_epu8(cC, cE);
3832 							c0 = _mm_avg_epu8(c0, c4);
3833 							c8 = _mm_avg_epu8(c8, cC);
3834 							c0 = _mm_avg_epu8(c0, c8);
3835 
3836 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
3837 						}
3838 
3839 						source0 += pitch;
3840 						source1 += pitch;
3841 						source2 += pitch;
3842 						source3 += pitch;
3843 						source4 += pitch;
3844 						source5 += pitch;
3845 						source6 += pitch;
3846 						source7 += pitch;
3847 						source8 += pitch;
3848 						source9 += pitch;
3849 						sourceA += pitch;
3850 						sourceB += pitch;
3851 						sourceC += pitch;
3852 						sourceD += pitch;
3853 						sourceE += pitch;
3854 						sourceF += pitch;
3855 					}
3856 				}
3857 				else ASSERT(false);
3858 			}
3859 			else
3860 			{
3861 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101))
3862 
3863 				if(internal.depth == 2)
3864 				{
3865 					for(int y = 0; y < height; y++)
3866 					{
3867 						for(int x = 0; x < width; x++)
3868 						{
3869 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3870 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3871 
3872 							c0 = AVERAGE(c0, c1);
3873 
3874 							*(unsigned int*)(source0 + 4 * x) = c0;
3875 						}
3876 
3877 						source0 += pitch;
3878 						source1 += pitch;
3879 					}
3880 				}
3881 				else if(internal.depth == 4)
3882 				{
3883 					for(int y = 0; y < height; y++)
3884 					{
3885 						for(int x = 0; x < width; x++)
3886 						{
3887 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3888 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3889 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3890 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3891 
3892 							c0 = AVERAGE(c0, c1);
3893 							c2 = AVERAGE(c2, c3);
3894 							c0 = AVERAGE(c0, c2);
3895 
3896 							*(unsigned int*)(source0 + 4 * x) = c0;
3897 						}
3898 
3899 						source0 += pitch;
3900 						source1 += pitch;
3901 						source2 += pitch;
3902 						source3 += pitch;
3903 					}
3904 				}
3905 				else if(internal.depth == 8)
3906 				{
3907 					for(int y = 0; y < height; y++)
3908 					{
3909 						for(int x = 0; x < width; x++)
3910 						{
3911 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3912 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3913 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3914 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3915 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3916 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3917 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3918 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3919 
3920 							c0 = AVERAGE(c0, c1);
3921 							c2 = AVERAGE(c2, c3);
3922 							c4 = AVERAGE(c4, c5);
3923 							c6 = AVERAGE(c6, c7);
3924 							c0 = AVERAGE(c0, c2);
3925 							c4 = AVERAGE(c4, c6);
3926 							c0 = AVERAGE(c0, c4);
3927 
3928 							*(unsigned int*)(source0 + 4 * x) = c0;
3929 						}
3930 
3931 						source0 += pitch;
3932 						source1 += pitch;
3933 						source2 += pitch;
3934 						source3 += pitch;
3935 						source4 += pitch;
3936 						source5 += pitch;
3937 						source6 += pitch;
3938 						source7 += pitch;
3939 					}
3940 				}
3941 				else if(internal.depth == 16)
3942 				{
3943 					for(int y = 0; y < height; y++)
3944 					{
3945 						for(int x = 0; x < width; x++)
3946 						{
3947 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
3948 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
3949 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
3950 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
3951 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
3952 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
3953 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
3954 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
3955 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
3956 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
3957 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
3958 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
3959 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
3960 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
3961 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
3962 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
3963 
3964 							c0 = AVERAGE(c0, c1);
3965 							c2 = AVERAGE(c2, c3);
3966 							c4 = AVERAGE(c4, c5);
3967 							c6 = AVERAGE(c6, c7);
3968 							c8 = AVERAGE(c8, c9);
3969 							cA = AVERAGE(cA, cB);
3970 							cC = AVERAGE(cC, cD);
3971 							cE = AVERAGE(cE, cF);
3972 							c0 = AVERAGE(c0, c2);
3973 							c4 = AVERAGE(c4, c6);
3974 							c8 = AVERAGE(c8, cA);
3975 							cC = AVERAGE(cC, cE);
3976 							c0 = AVERAGE(c0, c4);
3977 							c8 = AVERAGE(c8, cC);
3978 							c0 = AVERAGE(c0, c8);
3979 
3980 							*(unsigned int*)(source0 + 4 * x) = c0;
3981 						}
3982 
3983 						source0 += pitch;
3984 						source1 += pitch;
3985 						source2 += pitch;
3986 						source3 += pitch;
3987 						source4 += pitch;
3988 						source5 += pitch;
3989 						source6 += pitch;
3990 						source7 += pitch;
3991 						source8 += pitch;
3992 						source9 += pitch;
3993 						sourceA += pitch;
3994 						sourceB += pitch;
3995 						sourceC += pitch;
3996 						sourceD += pitch;
3997 						sourceE += pitch;
3998 						sourceF += pitch;
3999 					}
4000 				}
4001 				else ASSERT(false);
4002 
4003 				#undef AVERAGE
4004 			}
4005 		}
4006 		else if(internal.format == FORMAT_G16R16)
4007 		{
4008 			if(CPUID::supportsSSE2() && (width % 4) == 0)
4009 			{
4010 				if(internal.depth == 2)
4011 				{
4012 					for(int y = 0; y < height; y++)
4013 					{
4014 						for(int x = 0; x < width; x += 4)
4015 						{
4016 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4017 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4018 
4019 							c0 = _mm_avg_epu16(c0, c1);
4020 
4021 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4022 						}
4023 
4024 						source0 += pitch;
4025 						source1 += pitch;
4026 					}
4027 				}
4028 				else if(internal.depth == 4)
4029 				{
4030 					for(int y = 0; y < height; y++)
4031 					{
4032 						for(int x = 0; x < width; x += 4)
4033 						{
4034 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4035 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4036 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4037 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4038 
4039 							c0 = _mm_avg_epu16(c0, c1);
4040 							c2 = _mm_avg_epu16(c2, c3);
4041 							c0 = _mm_avg_epu16(c0, c2);
4042 
4043 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4044 						}
4045 
4046 						source0 += pitch;
4047 						source1 += pitch;
4048 						source2 += pitch;
4049 						source3 += pitch;
4050 					}
4051 				}
4052 				else if(internal.depth == 8)
4053 				{
4054 					for(int y = 0; y < height; y++)
4055 					{
4056 						for(int x = 0; x < width; x += 4)
4057 						{
4058 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4059 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4060 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4061 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4062 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4063 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4064 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4065 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4066 
4067 							c0 = _mm_avg_epu16(c0, c1);
4068 							c2 = _mm_avg_epu16(c2, c3);
4069 							c4 = _mm_avg_epu16(c4, c5);
4070 							c6 = _mm_avg_epu16(c6, c7);
4071 							c0 = _mm_avg_epu16(c0, c2);
4072 							c4 = _mm_avg_epu16(c4, c6);
4073 							c0 = _mm_avg_epu16(c0, c4);
4074 
4075 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4076 						}
4077 
4078 						source0 += pitch;
4079 						source1 += pitch;
4080 						source2 += pitch;
4081 						source3 += pitch;
4082 						source4 += pitch;
4083 						source5 += pitch;
4084 						source6 += pitch;
4085 						source7 += pitch;
4086 					}
4087 				}
4088 				else if(internal.depth == 16)
4089 				{
4090 					for(int y = 0; y < height; y++)
4091 					{
4092 						for(int x = 0; x < width; x += 4)
4093 						{
4094 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x));
4095 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x));
4096 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x));
4097 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x));
4098 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x));
4099 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x));
4100 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x));
4101 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x));
4102 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x));
4103 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x));
4104 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x));
4105 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x));
4106 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x));
4107 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x));
4108 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x));
4109 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x));
4110 
4111 							c0 = _mm_avg_epu16(c0, c1);
4112 							c2 = _mm_avg_epu16(c2, c3);
4113 							c4 = _mm_avg_epu16(c4, c5);
4114 							c6 = _mm_avg_epu16(c6, c7);
4115 							c8 = _mm_avg_epu16(c8, c9);
4116 							cA = _mm_avg_epu16(cA, cB);
4117 							cC = _mm_avg_epu16(cC, cD);
4118 							cE = _mm_avg_epu16(cE, cF);
4119 							c0 = _mm_avg_epu16(c0, c2);
4120 							c4 = _mm_avg_epu16(c4, c6);
4121 							c8 = _mm_avg_epu16(c8, cA);
4122 							cC = _mm_avg_epu16(cC, cE);
4123 							c0 = _mm_avg_epu16(c0, c4);
4124 							c8 = _mm_avg_epu16(c8, cC);
4125 							c0 = _mm_avg_epu16(c0, c8);
4126 
4127 							_mm_store_si128((__m128i*)(source0 + 4 * x), c0);
4128 						}
4129 
4130 						source0 += pitch;
4131 						source1 += pitch;
4132 						source2 += pitch;
4133 						source3 += pitch;
4134 						source4 += pitch;
4135 						source5 += pitch;
4136 						source6 += pitch;
4137 						source7 += pitch;
4138 						source8 += pitch;
4139 						source9 += pitch;
4140 						sourceA += pitch;
4141 						sourceB += pitch;
4142 						sourceC += pitch;
4143 						sourceD += pitch;
4144 						sourceE += pitch;
4145 						sourceF += pitch;
4146 					}
4147 				}
4148 				else ASSERT(false);
4149 			}
4150 			else
4151 			{
4152 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4153 
4154 				if(internal.depth == 2)
4155 				{
4156 					for(int y = 0; y < height; y++)
4157 					{
4158 						for(int x = 0; x < width; x++)
4159 						{
4160 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4161 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4162 
4163 							c0 = AVERAGE(c0, c1);
4164 
4165 							*(unsigned int*)(source0 + 4 * x) = c0;
4166 						}
4167 
4168 						source0 += pitch;
4169 						source1 += pitch;
4170 					}
4171 				}
4172 				else if(internal.depth == 4)
4173 				{
4174 					for(int y = 0; y < height; y++)
4175 					{
4176 						for(int x = 0; x < width; x++)
4177 						{
4178 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4179 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4180 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4181 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4182 
4183 							c0 = AVERAGE(c0, c1);
4184 							c2 = AVERAGE(c2, c3);
4185 							c0 = AVERAGE(c0, c2);
4186 
4187 							*(unsigned int*)(source0 + 4 * x) = c0;
4188 						}
4189 
4190 						source0 += pitch;
4191 						source1 += pitch;
4192 						source2 += pitch;
4193 						source3 += pitch;
4194 					}
4195 				}
4196 				else if(internal.depth == 8)
4197 				{
4198 					for(int y = 0; y < height; y++)
4199 					{
4200 						for(int x = 0; x < width; x++)
4201 						{
4202 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4203 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4204 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4205 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4206 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4207 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4208 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4209 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4210 
4211 							c0 = AVERAGE(c0, c1);
4212 							c2 = AVERAGE(c2, c3);
4213 							c4 = AVERAGE(c4, c5);
4214 							c6 = AVERAGE(c6, c7);
4215 							c0 = AVERAGE(c0, c2);
4216 							c4 = AVERAGE(c4, c6);
4217 							c0 = AVERAGE(c0, c4);
4218 
4219 							*(unsigned int*)(source0 + 4 * x) = c0;
4220 						}
4221 
4222 						source0 += pitch;
4223 						source1 += pitch;
4224 						source2 += pitch;
4225 						source3 += pitch;
4226 						source4 += pitch;
4227 						source5 += pitch;
4228 						source6 += pitch;
4229 						source7 += pitch;
4230 					}
4231 				}
4232 				else if(internal.depth == 16)
4233 				{
4234 					for(int y = 0; y < height; y++)
4235 					{
4236 						for(int x = 0; x < width; x++)
4237 						{
4238 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4239 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4240 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4241 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4242 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4243 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4244 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4245 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4246 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4247 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4248 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4249 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4250 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4251 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4252 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4253 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4254 
4255 							c0 = AVERAGE(c0, c1);
4256 							c2 = AVERAGE(c2, c3);
4257 							c4 = AVERAGE(c4, c5);
4258 							c6 = AVERAGE(c6, c7);
4259 							c8 = AVERAGE(c8, c9);
4260 							cA = AVERAGE(cA, cB);
4261 							cC = AVERAGE(cC, cD);
4262 							cE = AVERAGE(cE, cF);
4263 							c0 = AVERAGE(c0, c2);
4264 							c4 = AVERAGE(c4, c6);
4265 							c8 = AVERAGE(c8, cA);
4266 							cC = AVERAGE(cC, cE);
4267 							c0 = AVERAGE(c0, c4);
4268 							c8 = AVERAGE(c8, cC);
4269 							c0 = AVERAGE(c0, c8);
4270 
4271 							*(unsigned int*)(source0 + 4 * x) = c0;
4272 						}
4273 
4274 						source0 += pitch;
4275 						source1 += pitch;
4276 						source2 += pitch;
4277 						source3 += pitch;
4278 						source4 += pitch;
4279 						source5 += pitch;
4280 						source6 += pitch;
4281 						source7 += pitch;
4282 						source8 += pitch;
4283 						source9 += pitch;
4284 						sourceA += pitch;
4285 						sourceB += pitch;
4286 						sourceC += pitch;
4287 						sourceD += pitch;
4288 						sourceE += pitch;
4289 						sourceF += pitch;
4290 					}
4291 				}
4292 				else ASSERT(false);
4293 
4294 				#undef AVERAGE
4295 			}
4296 		}
4297 		else if(internal.format == FORMAT_A16B16G16R16)
4298 		{
4299 			if(CPUID::supportsSSE2() && (width % 2) == 0)
4300 			{
4301 				if(internal.depth == 2)
4302 				{
4303 					for(int y = 0; y < height; y++)
4304 					{
4305 						for(int x = 0; x < width; x += 2)
4306 						{
4307 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4308 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4309 
4310 							c0 = _mm_avg_epu16(c0, c1);
4311 
4312 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4313 						}
4314 
4315 						source0 += pitch;
4316 						source1 += pitch;
4317 					}
4318 				}
4319 				else if(internal.depth == 4)
4320 				{
4321 					for(int y = 0; y < height; y++)
4322 					{
4323 						for(int x = 0; x < width; x += 2)
4324 						{
4325 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4326 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4327 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4328 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4329 
4330 							c0 = _mm_avg_epu16(c0, c1);
4331 							c2 = _mm_avg_epu16(c2, c3);
4332 							c0 = _mm_avg_epu16(c0, c2);
4333 
4334 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4335 						}
4336 
4337 						source0 += pitch;
4338 						source1 += pitch;
4339 						source2 += pitch;
4340 						source3 += pitch;
4341 					}
4342 				}
4343 				else if(internal.depth == 8)
4344 				{
4345 					for(int y = 0; y < height; y++)
4346 					{
4347 						for(int x = 0; x < width; x += 2)
4348 						{
4349 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4350 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4351 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4352 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4353 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4354 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4355 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4356 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4357 
4358 							c0 = _mm_avg_epu16(c0, c1);
4359 							c2 = _mm_avg_epu16(c2, c3);
4360 							c4 = _mm_avg_epu16(c4, c5);
4361 							c6 = _mm_avg_epu16(c6, c7);
4362 							c0 = _mm_avg_epu16(c0, c2);
4363 							c4 = _mm_avg_epu16(c4, c6);
4364 							c0 = _mm_avg_epu16(c0, c4);
4365 
4366 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4367 						}
4368 
4369 						source0 += pitch;
4370 						source1 += pitch;
4371 						source2 += pitch;
4372 						source3 += pitch;
4373 						source4 += pitch;
4374 						source5 += pitch;
4375 						source6 += pitch;
4376 						source7 += pitch;
4377 					}
4378 				}
4379 				else if(internal.depth == 16)
4380 				{
4381 					for(int y = 0; y < height; y++)
4382 					{
4383 						for(int x = 0; x < width; x += 2)
4384 						{
4385 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x));
4386 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x));
4387 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x));
4388 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x));
4389 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x));
4390 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x));
4391 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x));
4392 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x));
4393 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x));
4394 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x));
4395 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x));
4396 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x));
4397 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x));
4398 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x));
4399 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x));
4400 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x));
4401 
4402 							c0 = _mm_avg_epu16(c0, c1);
4403 							c2 = _mm_avg_epu16(c2, c3);
4404 							c4 = _mm_avg_epu16(c4, c5);
4405 							c6 = _mm_avg_epu16(c6, c7);
4406 							c8 = _mm_avg_epu16(c8, c9);
4407 							cA = _mm_avg_epu16(cA, cB);
4408 							cC = _mm_avg_epu16(cC, cD);
4409 							cE = _mm_avg_epu16(cE, cF);
4410 							c0 = _mm_avg_epu16(c0, c2);
4411 							c4 = _mm_avg_epu16(c4, c6);
4412 							c8 = _mm_avg_epu16(c8, cA);
4413 							cC = _mm_avg_epu16(cC, cE);
4414 							c0 = _mm_avg_epu16(c0, c4);
4415 							c8 = _mm_avg_epu16(c8, cC);
4416 							c0 = _mm_avg_epu16(c0, c8);
4417 
4418 							_mm_store_si128((__m128i*)(source0 + 8 * x), c0);
4419 						}
4420 
4421 						source0 += pitch;
4422 						source1 += pitch;
4423 						source2 += pitch;
4424 						source3 += pitch;
4425 						source4 += pitch;
4426 						source5 += pitch;
4427 						source6 += pitch;
4428 						source7 += pitch;
4429 						source8 += pitch;
4430 						source9 += pitch;
4431 						sourceA += pitch;
4432 						sourceB += pitch;
4433 						sourceC += pitch;
4434 						sourceD += pitch;
4435 						sourceE += pitch;
4436 						sourceF += pitch;
4437 					}
4438 				}
4439 				else ASSERT(false);
4440 			}
4441 			else
4442 			{
4443 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001))
4444 
4445 				if(internal.depth == 2)
4446 				{
4447 					for(int y = 0; y < height; y++)
4448 					{
4449 						for(int x = 0; x < 2 * width; x++)
4450 						{
4451 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4452 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4453 
4454 							c0 = AVERAGE(c0, c1);
4455 
4456 							*(unsigned int*)(source0 + 4 * x) = c0;
4457 						}
4458 
4459 						source0 += pitch;
4460 						source1 += pitch;
4461 					}
4462 				}
4463 				else if(internal.depth == 4)
4464 				{
4465 					for(int y = 0; y < height; y++)
4466 					{
4467 						for(int x = 0; x < 2 * width; x++)
4468 						{
4469 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4470 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4471 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4472 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4473 
4474 							c0 = AVERAGE(c0, c1);
4475 							c2 = AVERAGE(c2, c3);
4476 							c0 = AVERAGE(c0, c2);
4477 
4478 							*(unsigned int*)(source0 + 4 * x) = c0;
4479 						}
4480 
4481 						source0 += pitch;
4482 						source1 += pitch;
4483 						source2 += pitch;
4484 						source3 += pitch;
4485 					}
4486 				}
4487 				else if(internal.depth == 8)
4488 				{
4489 					for(int y = 0; y < height; y++)
4490 					{
4491 						for(int x = 0; x < 2 * width; x++)
4492 						{
4493 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4494 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4495 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4496 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4497 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4498 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4499 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4500 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4501 
4502 							c0 = AVERAGE(c0, c1);
4503 							c2 = AVERAGE(c2, c3);
4504 							c4 = AVERAGE(c4, c5);
4505 							c6 = AVERAGE(c6, c7);
4506 							c0 = AVERAGE(c0, c2);
4507 							c4 = AVERAGE(c4, c6);
4508 							c0 = AVERAGE(c0, c4);
4509 
4510 							*(unsigned int*)(source0 + 4 * x) = c0;
4511 						}
4512 
4513 						source0 += pitch;
4514 						source1 += pitch;
4515 						source2 += pitch;
4516 						source3 += pitch;
4517 						source4 += pitch;
4518 						source5 += pitch;
4519 						source6 += pitch;
4520 						source7 += pitch;
4521 					}
4522 				}
4523 				else if(internal.depth == 16)
4524 				{
4525 					for(int y = 0; y < height; y++)
4526 					{
4527 						for(int x = 0; x < 2 * width; x++)
4528 						{
4529 							unsigned int c0 = *(unsigned int*)(source0 + 4 * x);
4530 							unsigned int c1 = *(unsigned int*)(source1 + 4 * x);
4531 							unsigned int c2 = *(unsigned int*)(source2 + 4 * x);
4532 							unsigned int c3 = *(unsigned int*)(source3 + 4 * x);
4533 							unsigned int c4 = *(unsigned int*)(source4 + 4 * x);
4534 							unsigned int c5 = *(unsigned int*)(source5 + 4 * x);
4535 							unsigned int c6 = *(unsigned int*)(source6 + 4 * x);
4536 							unsigned int c7 = *(unsigned int*)(source7 + 4 * x);
4537 							unsigned int c8 = *(unsigned int*)(source8 + 4 * x);
4538 							unsigned int c9 = *(unsigned int*)(source9 + 4 * x);
4539 							unsigned int cA = *(unsigned int*)(sourceA + 4 * x);
4540 							unsigned int cB = *(unsigned int*)(sourceB + 4 * x);
4541 							unsigned int cC = *(unsigned int*)(sourceC + 4 * x);
4542 							unsigned int cD = *(unsigned int*)(sourceD + 4 * x);
4543 							unsigned int cE = *(unsigned int*)(sourceE + 4 * x);
4544 							unsigned int cF = *(unsigned int*)(sourceF + 4 * x);
4545 
4546 							c0 = AVERAGE(c0, c1);
4547 							c2 = AVERAGE(c2, c3);
4548 							c4 = AVERAGE(c4, c5);
4549 							c6 = AVERAGE(c6, c7);
4550 							c8 = AVERAGE(c8, c9);
4551 							cA = AVERAGE(cA, cB);
4552 							cC = AVERAGE(cC, cD);
4553 							cE = AVERAGE(cE, cF);
4554 							c0 = AVERAGE(c0, c2);
4555 							c4 = AVERAGE(c4, c6);
4556 							c8 = AVERAGE(c8, cA);
4557 							cC = AVERAGE(cC, cE);
4558 							c0 = AVERAGE(c0, c4);
4559 							c8 = AVERAGE(c8, cC);
4560 							c0 = AVERAGE(c0, c8);
4561 
4562 							*(unsigned int*)(source0 + 4 * x) = c0;
4563 						}
4564 
4565 						source0 += pitch;
4566 						source1 += pitch;
4567 						source2 += pitch;
4568 						source3 += pitch;
4569 						source4 += pitch;
4570 						source5 += pitch;
4571 						source6 += pitch;
4572 						source7 += pitch;
4573 						source8 += pitch;
4574 						source9 += pitch;
4575 						sourceA += pitch;
4576 						sourceB += pitch;
4577 						sourceC += pitch;
4578 						sourceD += pitch;
4579 						sourceE += pitch;
4580 						sourceF += pitch;
4581 					}
4582 				}
4583 				else ASSERT(false);
4584 
4585 				#undef AVERAGE
4586 			}
4587 		}
4588 		else if(internal.format == FORMAT_R32F)
4589 		{
4590 			if(CPUID::supportsSSE() && (width % 4) == 0)
4591 			{
4592 				if(internal.depth == 2)
4593 				{
4594 					for(int y = 0; y < height; y++)
4595 					{
4596 						for(int x = 0; x < width; x += 4)
4597 						{
4598 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4599 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4600 
4601 							c0 = _mm_add_ps(c0, c1);
4602 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4603 
4604 							_mm_store_ps((float*)(source0 + 4 * x), c0);
4605 						}
4606 
4607 						source0 += pitch;
4608 						source1 += pitch;
4609 					}
4610 				}
4611 				else if(internal.depth == 4)
4612 				{
4613 					for(int y = 0; y < height; y++)
4614 					{
4615 						for(int x = 0; x < width; x += 4)
4616 						{
4617 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4618 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4619 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4620 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4621 
4622 							c0 = _mm_add_ps(c0, c1);
4623 							c2 = _mm_add_ps(c2, c3);
4624 							c0 = _mm_add_ps(c0, c2);
4625 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4626 
4627 							_mm_store_ps((float*)(source0 + 4 * x), c0);
4628 						}
4629 
4630 						source0 += pitch;
4631 						source1 += pitch;
4632 						source2 += pitch;
4633 						source3 += pitch;
4634 					}
4635 				}
4636 				else if(internal.depth == 8)
4637 				{
4638 					for(int y = 0; y < height; y++)
4639 					{
4640 						for(int x = 0; x < width; x += 4)
4641 						{
4642 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4643 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4644 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4645 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4646 							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4647 							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4648 							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4649 							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4650 
4651 							c0 = _mm_add_ps(c0, c1);
4652 							c2 = _mm_add_ps(c2, c3);
4653 							c4 = _mm_add_ps(c4, c5);
4654 							c6 = _mm_add_ps(c6, c7);
4655 							c0 = _mm_add_ps(c0, c2);
4656 							c4 = _mm_add_ps(c4, c6);
4657 							c0 = _mm_add_ps(c0, c4);
4658 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4659 
4660 							_mm_store_ps((float*)(source0 + 4 * x), c0);
4661 						}
4662 
4663 						source0 += pitch;
4664 						source1 += pitch;
4665 						source2 += pitch;
4666 						source3 += pitch;
4667 						source4 += pitch;
4668 						source5 += pitch;
4669 						source6 += pitch;
4670 						source7 += pitch;
4671 					}
4672 				}
4673 				else if(internal.depth == 16)
4674 				{
4675 					for(int y = 0; y < height; y++)
4676 					{
4677 						for(int x = 0; x < width; x += 4)
4678 						{
4679 							__m128 c0 = _mm_load_ps((float*)(source0 + 4 * x));
4680 							__m128 c1 = _mm_load_ps((float*)(source1 + 4 * x));
4681 							__m128 c2 = _mm_load_ps((float*)(source2 + 4 * x));
4682 							__m128 c3 = _mm_load_ps((float*)(source3 + 4 * x));
4683 							__m128 c4 = _mm_load_ps((float*)(source4 + 4 * x));
4684 							__m128 c5 = _mm_load_ps((float*)(source5 + 4 * x));
4685 							__m128 c6 = _mm_load_ps((float*)(source6 + 4 * x));
4686 							__m128 c7 = _mm_load_ps((float*)(source7 + 4 * x));
4687 							__m128 c8 = _mm_load_ps((float*)(source8 + 4 * x));
4688 							__m128 c9 = _mm_load_ps((float*)(source9 + 4 * x));
4689 							__m128 cA = _mm_load_ps((float*)(sourceA + 4 * x));
4690 							__m128 cB = _mm_load_ps((float*)(sourceB + 4 * x));
4691 							__m128 cC = _mm_load_ps((float*)(sourceC + 4 * x));
4692 							__m128 cD = _mm_load_ps((float*)(sourceD + 4 * x));
4693 							__m128 cE = _mm_load_ps((float*)(sourceE + 4 * x));
4694 							__m128 cF = _mm_load_ps((float*)(sourceF + 4 * x));
4695 
4696 							c0 = _mm_add_ps(c0, c1);
4697 							c2 = _mm_add_ps(c2, c3);
4698 							c4 = _mm_add_ps(c4, c5);
4699 							c6 = _mm_add_ps(c6, c7);
4700 							c8 = _mm_add_ps(c8, c9);
4701 							cA = _mm_add_ps(cA, cB);
4702 							cC = _mm_add_ps(cC, cD);
4703 							cE = _mm_add_ps(cE, cF);
4704 							c0 = _mm_add_ps(c0, c2);
4705 							c4 = _mm_add_ps(c4, c6);
4706 							c8 = _mm_add_ps(c8, cA);
4707 							cC = _mm_add_ps(cC, cE);
4708 							c0 = _mm_add_ps(c0, c4);
4709 							c8 = _mm_add_ps(c8, cC);
4710 							c0 = _mm_add_ps(c0, c8);
4711 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
4712 
4713 							_mm_store_ps((float*)(source0 + 4 * x), c0);
4714 						}
4715 
4716 						source0 += pitch;
4717 						source1 += pitch;
4718 						source2 += pitch;
4719 						source3 += pitch;
4720 						source4 += pitch;
4721 						source5 += pitch;
4722 						source6 += pitch;
4723 						source7 += pitch;
4724 						source8 += pitch;
4725 						source9 += pitch;
4726 						sourceA += pitch;
4727 						sourceB += pitch;
4728 						sourceC += pitch;
4729 						sourceD += pitch;
4730 						sourceE += pitch;
4731 						sourceF += pitch;
4732 					}
4733 				}
4734 				else ASSERT(false);
4735 			}
4736 			else
4737 			{
4738 				if(internal.depth == 2)
4739 				{
4740 					for(int y = 0; y < height; y++)
4741 					{
4742 						for(int x = 0; x < width; x++)
4743 						{
4744 							float c0 = *(float*)(source0 + 4 * x);
4745 							float c1 = *(float*)(source1 + 4 * x);
4746 
4747 							c0 = c0 + c1;
4748 							c0 *= 1.0f / 2.0f;
4749 
4750 							*(float*)(source0 + 4 * x) = c0;
4751 						}
4752 
4753 						source0 += pitch;
4754 						source1 += pitch;
4755 					}
4756 				}
4757 				else if(internal.depth == 4)
4758 				{
4759 					for(int y = 0; y < height; y++)
4760 					{
4761 						for(int x = 0; x < width; x++)
4762 						{
4763 							float c0 = *(float*)(source0 + 4 * x);
4764 							float c1 = *(float*)(source1 + 4 * x);
4765 							float c2 = *(float*)(source2 + 4 * x);
4766 							float c3 = *(float*)(source3 + 4 * x);
4767 
4768 							c0 = c0 + c1;
4769 							c2 = c2 + c3;
4770 							c0 = c0 + c2;
4771 							c0 *= 1.0f / 4.0f;
4772 
4773 							*(float*)(source0 + 4 * x) = c0;
4774 						}
4775 
4776 						source0 += pitch;
4777 						source1 += pitch;
4778 						source2 += pitch;
4779 						source3 += pitch;
4780 					}
4781 				}
4782 				else if(internal.depth == 8)
4783 				{
4784 					for(int y = 0; y < height; y++)
4785 					{
4786 						for(int x = 0; x < width; x++)
4787 						{
4788 							float c0 = *(float*)(source0 + 4 * x);
4789 							float c1 = *(float*)(source1 + 4 * x);
4790 							float c2 = *(float*)(source2 + 4 * x);
4791 							float c3 = *(float*)(source3 + 4 * x);
4792 							float c4 = *(float*)(source4 + 4 * x);
4793 							float c5 = *(float*)(source5 + 4 * x);
4794 							float c6 = *(float*)(source6 + 4 * x);
4795 							float c7 = *(float*)(source7 + 4 * x);
4796 
4797 							c0 = c0 + c1;
4798 							c2 = c2 + c3;
4799 							c4 = c4 + c5;
4800 							c6 = c6 + c7;
4801 							c0 = c0 + c2;
4802 							c4 = c4 + c6;
4803 							c0 = c0 + c4;
4804 							c0 *= 1.0f / 8.0f;
4805 
4806 							*(float*)(source0 + 4 * x) = c0;
4807 						}
4808 
4809 						source0 += pitch;
4810 						source1 += pitch;
4811 						source2 += pitch;
4812 						source3 += pitch;
4813 						source4 += pitch;
4814 						source5 += pitch;
4815 						source6 += pitch;
4816 						source7 += pitch;
4817 					}
4818 				}
4819 				else if(internal.depth == 16)
4820 				{
4821 					for(int y = 0; y < height; y++)
4822 					{
4823 						for(int x = 0; x < width; x++)
4824 						{
4825 							float c0 = *(float*)(source0 + 4 * x);
4826 							float c1 = *(float*)(source1 + 4 * x);
4827 							float c2 = *(float*)(source2 + 4 * x);
4828 							float c3 = *(float*)(source3 + 4 * x);
4829 							float c4 = *(float*)(source4 + 4 * x);
4830 							float c5 = *(float*)(source5 + 4 * x);
4831 							float c6 = *(float*)(source6 + 4 * x);
4832 							float c7 = *(float*)(source7 + 4 * x);
4833 							float c8 = *(float*)(source8 + 4 * x);
4834 							float c9 = *(float*)(source9 + 4 * x);
4835 							float cA = *(float*)(sourceA + 4 * x);
4836 							float cB = *(float*)(sourceB + 4 * x);
4837 							float cC = *(float*)(sourceC + 4 * x);
4838 							float cD = *(float*)(sourceD + 4 * x);
4839 							float cE = *(float*)(sourceE + 4 * x);
4840 							float cF = *(float*)(sourceF + 4 * x);
4841 
4842 							c0 = c0 + c1;
4843 							c2 = c2 + c3;
4844 							c4 = c4 + c5;
4845 							c6 = c6 + c7;
4846 							c8 = c8 + c9;
4847 							cA = cA + cB;
4848 							cC = cC + cD;
4849 							cE = cE + cF;
4850 							c0 = c0 + c2;
4851 							c4 = c4 + c6;
4852 							c8 = c8 + cA;
4853 							cC = cC + cE;
4854 							c0 = c0 + c4;
4855 							c8 = c8 + cC;
4856 							c0 = c0 + c8;
4857 							c0 *= 1.0f / 16.0f;
4858 
4859 							*(float*)(source0 + 4 * x) = c0;
4860 						}
4861 
4862 						source0 += pitch;
4863 						source1 += pitch;
4864 						source2 += pitch;
4865 						source3 += pitch;
4866 						source4 += pitch;
4867 						source5 += pitch;
4868 						source6 += pitch;
4869 						source7 += pitch;
4870 						source8 += pitch;
4871 						source9 += pitch;
4872 						sourceA += pitch;
4873 						sourceB += pitch;
4874 						sourceC += pitch;
4875 						sourceD += pitch;
4876 						sourceE += pitch;
4877 						sourceF += pitch;
4878 					}
4879 				}
4880 				else ASSERT(false);
4881 			}
4882 		}
4883 		else if(internal.format == FORMAT_G32R32F)
4884 		{
4885 			if(CPUID::supportsSSE() && (width % 2) == 0)
4886 			{
4887 				if(internal.depth == 2)
4888 				{
4889 					for(int y = 0; y < height; y++)
4890 					{
4891 						for(int x = 0; x < width; x += 2)
4892 						{
4893 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4894 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4895 
4896 							c0 = _mm_add_ps(c0, c1);
4897 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
4898 
4899 							_mm_store_ps((float*)(source0 + 8 * x), c0);
4900 						}
4901 
4902 						source0 += pitch;
4903 						source1 += pitch;
4904 					}
4905 				}
4906 				else if(internal.depth == 4)
4907 				{
4908 					for(int y = 0; y < height; y++)
4909 					{
4910 						for(int x = 0; x < width; x += 2)
4911 						{
4912 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4913 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4914 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4915 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4916 
4917 							c0 = _mm_add_ps(c0, c1);
4918 							c2 = _mm_add_ps(c2, c3);
4919 							c0 = _mm_add_ps(c0, c2);
4920 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
4921 
4922 							_mm_store_ps((float*)(source0 + 8 * x), c0);
4923 						}
4924 
4925 						source0 += pitch;
4926 						source1 += pitch;
4927 						source2 += pitch;
4928 						source3 += pitch;
4929 					}
4930 				}
4931 				else if(internal.depth == 8)
4932 				{
4933 					for(int y = 0; y < height; y++)
4934 					{
4935 						for(int x = 0; x < width; x += 2)
4936 						{
4937 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4938 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4939 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4940 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4941 							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4942 							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4943 							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4944 							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4945 
4946 							c0 = _mm_add_ps(c0, c1);
4947 							c2 = _mm_add_ps(c2, c3);
4948 							c4 = _mm_add_ps(c4, c5);
4949 							c6 = _mm_add_ps(c6, c7);
4950 							c0 = _mm_add_ps(c0, c2);
4951 							c4 = _mm_add_ps(c4, c6);
4952 							c0 = _mm_add_ps(c0, c4);
4953 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
4954 
4955 							_mm_store_ps((float*)(source0 + 8 * x), c0);
4956 						}
4957 
4958 						source0 += pitch;
4959 						source1 += pitch;
4960 						source2 += pitch;
4961 						source3 += pitch;
4962 						source4 += pitch;
4963 						source5 += pitch;
4964 						source6 += pitch;
4965 						source7 += pitch;
4966 					}
4967 				}
4968 				else if(internal.depth == 16)
4969 				{
4970 					for(int y = 0; y < height; y++)
4971 					{
4972 						for(int x = 0; x < width; x += 2)
4973 						{
4974 							__m128 c0 = _mm_load_ps((float*)(source0 + 8 * x));
4975 							__m128 c1 = _mm_load_ps((float*)(source1 + 8 * x));
4976 							__m128 c2 = _mm_load_ps((float*)(source2 + 8 * x));
4977 							__m128 c3 = _mm_load_ps((float*)(source3 + 8 * x));
4978 							__m128 c4 = _mm_load_ps((float*)(source4 + 8 * x));
4979 							__m128 c5 = _mm_load_ps((float*)(source5 + 8 * x));
4980 							__m128 c6 = _mm_load_ps((float*)(source6 + 8 * x));
4981 							__m128 c7 = _mm_load_ps((float*)(source7 + 8 * x));
4982 							__m128 c8 = _mm_load_ps((float*)(source8 + 8 * x));
4983 							__m128 c9 = _mm_load_ps((float*)(source9 + 8 * x));
4984 							__m128 cA = _mm_load_ps((float*)(sourceA + 8 * x));
4985 							__m128 cB = _mm_load_ps((float*)(sourceB + 8 * x));
4986 							__m128 cC = _mm_load_ps((float*)(sourceC + 8 * x));
4987 							__m128 cD = _mm_load_ps((float*)(sourceD + 8 * x));
4988 							__m128 cE = _mm_load_ps((float*)(sourceE + 8 * x));
4989 							__m128 cF = _mm_load_ps((float*)(sourceF + 8 * x));
4990 
4991 							c0 = _mm_add_ps(c0, c1);
4992 							c2 = _mm_add_ps(c2, c3);
4993 							c4 = _mm_add_ps(c4, c5);
4994 							c6 = _mm_add_ps(c6, c7);
4995 							c8 = _mm_add_ps(c8, c9);
4996 							cA = _mm_add_ps(cA, cB);
4997 							cC = _mm_add_ps(cC, cD);
4998 							cE = _mm_add_ps(cE, cF);
4999 							c0 = _mm_add_ps(c0, c2);
5000 							c4 = _mm_add_ps(c4, c6);
5001 							c8 = _mm_add_ps(c8, cA);
5002 							cC = _mm_add_ps(cC, cE);
5003 							c0 = _mm_add_ps(c0, c4);
5004 							c8 = _mm_add_ps(c8, cC);
5005 							c0 = _mm_add_ps(c0, c8);
5006 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5007 
5008 							_mm_store_ps((float*)(source0 + 8 * x), c0);
5009 						}
5010 
5011 						source0 += pitch;
5012 						source1 += pitch;
5013 						source2 += pitch;
5014 						source3 += pitch;
5015 						source4 += pitch;
5016 						source5 += pitch;
5017 						source6 += pitch;
5018 						source7 += pitch;
5019 						source8 += pitch;
5020 						source9 += pitch;
5021 						sourceA += pitch;
5022 						sourceB += pitch;
5023 						sourceC += pitch;
5024 						sourceD += pitch;
5025 						sourceE += pitch;
5026 						sourceF += pitch;
5027 					}
5028 				}
5029 				else ASSERT(false);
5030 			}
5031 			else
5032 			{
5033 				if(internal.depth == 2)
5034 				{
5035 					for(int y = 0; y < height; y++)
5036 					{
5037 						for(int x = 0; x < 2 * width; x++)
5038 						{
5039 							float c0 = *(float*)(source0 + 4 * x);
5040 							float c1 = *(float*)(source1 + 4 * x);
5041 
5042 							c0 = c0 + c1;
5043 							c0 *= 1.0f / 2.0f;
5044 
5045 							*(float*)(source0 + 4 * x) = c0;
5046 						}
5047 
5048 						source0 += pitch;
5049 						source1 += pitch;
5050 					}
5051 				}
5052 				else if(internal.depth == 4)
5053 				{
5054 					for(int y = 0; y < height; y++)
5055 					{
5056 						for(int x = 0; x < 2 * width; x++)
5057 						{
5058 							float c0 = *(float*)(source0 + 4 * x);
5059 							float c1 = *(float*)(source1 + 4 * x);
5060 							float c2 = *(float*)(source2 + 4 * x);
5061 							float c3 = *(float*)(source3 + 4 * x);
5062 
5063 							c0 = c0 + c1;
5064 							c2 = c2 + c3;
5065 							c0 = c0 + c2;
5066 							c0 *= 1.0f / 4.0f;
5067 
5068 							*(float*)(source0 + 4 * x) = c0;
5069 						}
5070 
5071 						source0 += pitch;
5072 						source1 += pitch;
5073 						source2 += pitch;
5074 						source3 += pitch;
5075 					}
5076 				}
5077 				else if(internal.depth == 8)
5078 				{
5079 					for(int y = 0; y < height; y++)
5080 					{
5081 						for(int x = 0; x < 2 * width; x++)
5082 						{
5083 							float c0 = *(float*)(source0 + 4 * x);
5084 							float c1 = *(float*)(source1 + 4 * x);
5085 							float c2 = *(float*)(source2 + 4 * x);
5086 							float c3 = *(float*)(source3 + 4 * x);
5087 							float c4 = *(float*)(source4 + 4 * x);
5088 							float c5 = *(float*)(source5 + 4 * x);
5089 							float c6 = *(float*)(source6 + 4 * x);
5090 							float c7 = *(float*)(source7 + 4 * x);
5091 
5092 							c0 = c0 + c1;
5093 							c2 = c2 + c3;
5094 							c4 = c4 + c5;
5095 							c6 = c6 + c7;
5096 							c0 = c0 + c2;
5097 							c4 = c4 + c6;
5098 							c0 = c0 + c4;
5099 							c0 *= 1.0f / 8.0f;
5100 
5101 							*(float*)(source0 + 4 * x) = c0;
5102 						}
5103 
5104 						source0 += pitch;
5105 						source1 += pitch;
5106 						source2 += pitch;
5107 						source3 += pitch;
5108 						source4 += pitch;
5109 						source5 += pitch;
5110 						source6 += pitch;
5111 						source7 += pitch;
5112 					}
5113 				}
5114 				else if(internal.depth == 16)
5115 				{
5116 					for(int y = 0; y < height; y++)
5117 					{
5118 						for(int x = 0; x < 2 * width; x++)
5119 						{
5120 							float c0 = *(float*)(source0 + 4 * x);
5121 							float c1 = *(float*)(source1 + 4 * x);
5122 							float c2 = *(float*)(source2 + 4 * x);
5123 							float c3 = *(float*)(source3 + 4 * x);
5124 							float c4 = *(float*)(source4 + 4 * x);
5125 							float c5 = *(float*)(source5 + 4 * x);
5126 							float c6 = *(float*)(source6 + 4 * x);
5127 							float c7 = *(float*)(source7 + 4 * x);
5128 							float c8 = *(float*)(source8 + 4 * x);
5129 							float c9 = *(float*)(source9 + 4 * x);
5130 							float cA = *(float*)(sourceA + 4 * x);
5131 							float cB = *(float*)(sourceB + 4 * x);
5132 							float cC = *(float*)(sourceC + 4 * x);
5133 							float cD = *(float*)(sourceD + 4 * x);
5134 							float cE = *(float*)(sourceE + 4 * x);
5135 							float cF = *(float*)(sourceF + 4 * x);
5136 
5137 							c0 = c0 + c1;
5138 							c2 = c2 + c3;
5139 							c4 = c4 + c5;
5140 							c6 = c6 + c7;
5141 							c8 = c8 + c9;
5142 							cA = cA + cB;
5143 							cC = cC + cD;
5144 							cE = cE + cF;
5145 							c0 = c0 + c2;
5146 							c4 = c4 + c6;
5147 							c8 = c8 + cA;
5148 							cC = cC + cE;
5149 							c0 = c0 + c4;
5150 							c8 = c8 + cC;
5151 							c0 = c0 + c8;
5152 							c0 *= 1.0f / 16.0f;
5153 
5154 							*(float*)(source0 + 4 * x) = c0;
5155 						}
5156 
5157 						source0 += pitch;
5158 						source1 += pitch;
5159 						source2 += pitch;
5160 						source3 += pitch;
5161 						source4 += pitch;
5162 						source5 += pitch;
5163 						source6 += pitch;
5164 						source7 += pitch;
5165 						source8 += pitch;
5166 						source9 += pitch;
5167 						sourceA += pitch;
5168 						sourceB += pitch;
5169 						sourceC += pitch;
5170 						sourceD += pitch;
5171 						sourceE += pitch;
5172 						sourceF += pitch;
5173 					}
5174 				}
5175 				else ASSERT(false);
5176 			}
5177 		}
5178 		else if(internal.format == FORMAT_A32B32G32R32F || internal.format == FORMAT_X32B32G32R32F)
5179 		{
5180 			if(CPUID::supportsSSE())
5181 			{
5182 				if(internal.depth == 2)
5183 				{
5184 					for(int y = 0; y < height; y++)
5185 					{
5186 						for(int x = 0; x < width; x++)
5187 						{
5188 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5189 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5190 
5191 							c0 = _mm_add_ps(c0, c1);
5192 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f));
5193 
5194 							_mm_store_ps((float*)(source0 + 16 * x), c0);
5195 						}
5196 
5197 						source0 += pitch;
5198 						source1 += pitch;
5199 					}
5200 				}
5201 				else if(internal.depth == 4)
5202 				{
5203 					for(int y = 0; y < height; y++)
5204 					{
5205 						for(int x = 0; x < width; x++)
5206 						{
5207 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5208 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5209 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5210 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5211 
5212 							c0 = _mm_add_ps(c0, c1);
5213 							c2 = _mm_add_ps(c2, c3);
5214 							c0 = _mm_add_ps(c0, c2);
5215 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f));
5216 
5217 							_mm_store_ps((float*)(source0 + 16 * x), c0);
5218 						}
5219 
5220 						source0 += pitch;
5221 						source1 += pitch;
5222 						source2 += pitch;
5223 						source3 += pitch;
5224 					}
5225 				}
5226 				else if(internal.depth == 8)
5227 				{
5228 					for(int y = 0; y < height; y++)
5229 					{
5230 						for(int x = 0; x < width; x++)
5231 						{
5232 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5233 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5234 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5235 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5236 							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5237 							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5238 							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5239 							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5240 
5241 							c0 = _mm_add_ps(c0, c1);
5242 							c2 = _mm_add_ps(c2, c3);
5243 							c4 = _mm_add_ps(c4, c5);
5244 							c6 = _mm_add_ps(c6, c7);
5245 							c0 = _mm_add_ps(c0, c2);
5246 							c4 = _mm_add_ps(c4, c6);
5247 							c0 = _mm_add_ps(c0, c4);
5248 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f));
5249 
5250 							_mm_store_ps((float*)(source0 + 16 * x), c0);
5251 						}
5252 
5253 						source0 += pitch;
5254 						source1 += pitch;
5255 						source2 += pitch;
5256 						source3 += pitch;
5257 						source4 += pitch;
5258 						source5 += pitch;
5259 						source6 += pitch;
5260 						source7 += pitch;
5261 					}
5262 				}
5263 				else if(internal.depth == 16)
5264 				{
5265 					for(int y = 0; y < height; y++)
5266 					{
5267 						for(int x = 0; x < width; x++)
5268 						{
5269 							__m128 c0 = _mm_load_ps((float*)(source0 + 16 * x));
5270 							__m128 c1 = _mm_load_ps((float*)(source1 + 16 * x));
5271 							__m128 c2 = _mm_load_ps((float*)(source2 + 16 * x));
5272 							__m128 c3 = _mm_load_ps((float*)(source3 + 16 * x));
5273 							__m128 c4 = _mm_load_ps((float*)(source4 + 16 * x));
5274 							__m128 c5 = _mm_load_ps((float*)(source5 + 16 * x));
5275 							__m128 c6 = _mm_load_ps((float*)(source6 + 16 * x));
5276 							__m128 c7 = _mm_load_ps((float*)(source7 + 16 * x));
5277 							__m128 c8 = _mm_load_ps((float*)(source8 + 16 * x));
5278 							__m128 c9 = _mm_load_ps((float*)(source9 + 16 * x));
5279 							__m128 cA = _mm_load_ps((float*)(sourceA + 16 * x));
5280 							__m128 cB = _mm_load_ps((float*)(sourceB + 16 * x));
5281 							__m128 cC = _mm_load_ps((float*)(sourceC + 16 * x));
5282 							__m128 cD = _mm_load_ps((float*)(sourceD + 16 * x));
5283 							__m128 cE = _mm_load_ps((float*)(sourceE + 16 * x));
5284 							__m128 cF = _mm_load_ps((float*)(sourceF + 16 * x));
5285 
5286 							c0 = _mm_add_ps(c0, c1);
5287 							c2 = _mm_add_ps(c2, c3);
5288 							c4 = _mm_add_ps(c4, c5);
5289 							c6 = _mm_add_ps(c6, c7);
5290 							c8 = _mm_add_ps(c8, c9);
5291 							cA = _mm_add_ps(cA, cB);
5292 							cC = _mm_add_ps(cC, cD);
5293 							cE = _mm_add_ps(cE, cF);
5294 							c0 = _mm_add_ps(c0, c2);
5295 							c4 = _mm_add_ps(c4, c6);
5296 							c8 = _mm_add_ps(c8, cA);
5297 							cC = _mm_add_ps(cC, cE);
5298 							c0 = _mm_add_ps(c0, c4);
5299 							c8 = _mm_add_ps(c8, cC);
5300 							c0 = _mm_add_ps(c0, c8);
5301 							c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f));
5302 
5303 							_mm_store_ps((float*)(source0 + 16 * x), c0);
5304 						}
5305 
5306 						source0 += pitch;
5307 						source1 += pitch;
5308 						source2 += pitch;
5309 						source3 += pitch;
5310 						source4 += pitch;
5311 						source5 += pitch;
5312 						source6 += pitch;
5313 						source7 += pitch;
5314 						source8 += pitch;
5315 						source9 += pitch;
5316 						sourceA += pitch;
5317 						sourceB += pitch;
5318 						sourceC += pitch;
5319 						sourceD += pitch;
5320 						sourceE += pitch;
5321 						sourceF += pitch;
5322 					}
5323 				}
5324 				else ASSERT(false);
5325 			}
5326 			else
5327 			{
5328 				if(internal.depth == 2)
5329 				{
5330 					for(int y = 0; y < height; y++)
5331 					{
5332 						for(int x = 0; x < 4 * width; x++)
5333 						{
5334 							float c0 = *(float*)(source0 + 4 * x);
5335 							float c1 = *(float*)(source1 + 4 * x);
5336 
5337 							c0 = c0 + c1;
5338 							c0 *= 1.0f / 2.0f;
5339 
5340 							*(float*)(source0 + 4 * x) = c0;
5341 						}
5342 
5343 						source0 += pitch;
5344 						source1 += pitch;
5345 					}
5346 				}
5347 				else if(internal.depth == 4)
5348 				{
5349 					for(int y = 0; y < height; y++)
5350 					{
5351 						for(int x = 0; x < 4 * width; x++)
5352 						{
5353 							float c0 = *(float*)(source0 + 4 * x);
5354 							float c1 = *(float*)(source1 + 4 * x);
5355 							float c2 = *(float*)(source2 + 4 * x);
5356 							float c3 = *(float*)(source3 + 4 * x);
5357 
5358 							c0 = c0 + c1;
5359 							c2 = c2 + c3;
5360 							c0 = c0 + c2;
5361 							c0 *= 1.0f / 4.0f;
5362 
5363 							*(float*)(source0 + 4 * x) = c0;
5364 						}
5365 
5366 						source0 += pitch;
5367 						source1 += pitch;
5368 						source2 += pitch;
5369 						source3 += pitch;
5370 					}
5371 				}
5372 				else if(internal.depth == 8)
5373 				{
5374 					for(int y = 0; y < height; y++)
5375 					{
5376 						for(int x = 0; x < 4 * width; x++)
5377 						{
5378 							float c0 = *(float*)(source0 + 4 * x);
5379 							float c1 = *(float*)(source1 + 4 * x);
5380 							float c2 = *(float*)(source2 + 4 * x);
5381 							float c3 = *(float*)(source3 + 4 * x);
5382 							float c4 = *(float*)(source4 + 4 * x);
5383 							float c5 = *(float*)(source5 + 4 * x);
5384 							float c6 = *(float*)(source6 + 4 * x);
5385 							float c7 = *(float*)(source7 + 4 * x);
5386 
5387 							c0 = c0 + c1;
5388 							c2 = c2 + c3;
5389 							c4 = c4 + c5;
5390 							c6 = c6 + c7;
5391 							c0 = c0 + c2;
5392 							c4 = c4 + c6;
5393 							c0 = c0 + c4;
5394 							c0 *= 1.0f / 8.0f;
5395 
5396 							*(float*)(source0 + 4 * x) = c0;
5397 						}
5398 
5399 						source0 += pitch;
5400 						source1 += pitch;
5401 						source2 += pitch;
5402 						source3 += pitch;
5403 						source4 += pitch;
5404 						source5 += pitch;
5405 						source6 += pitch;
5406 						source7 += pitch;
5407 					}
5408 				}
5409 				else if(internal.depth == 16)
5410 				{
5411 					for(int y = 0; y < height; y++)
5412 					{
5413 						for(int x = 0; x < 4 * width; x++)
5414 						{
5415 							float c0 = *(float*)(source0 + 4 * x);
5416 							float c1 = *(float*)(source1 + 4 * x);
5417 							float c2 = *(float*)(source2 + 4 * x);
5418 							float c3 = *(float*)(source3 + 4 * x);
5419 							float c4 = *(float*)(source4 + 4 * x);
5420 							float c5 = *(float*)(source5 + 4 * x);
5421 							float c6 = *(float*)(source6 + 4 * x);
5422 							float c7 = *(float*)(source7 + 4 * x);
5423 							float c8 = *(float*)(source8 + 4 * x);
5424 							float c9 = *(float*)(source9 + 4 * x);
5425 							float cA = *(float*)(sourceA + 4 * x);
5426 							float cB = *(float*)(sourceB + 4 * x);
5427 							float cC = *(float*)(sourceC + 4 * x);
5428 							float cD = *(float*)(sourceD + 4 * x);
5429 							float cE = *(float*)(sourceE + 4 * x);
5430 							float cF = *(float*)(sourceF + 4 * x);
5431 
5432 							c0 = c0 + c1;
5433 							c2 = c2 + c3;
5434 							c4 = c4 + c5;
5435 							c6 = c6 + c7;
5436 							c8 = c8 + c9;
5437 							cA = cA + cB;
5438 							cC = cC + cD;
5439 							cE = cE + cF;
5440 							c0 = c0 + c2;
5441 							c4 = c4 + c6;
5442 							c8 = c8 + cA;
5443 							cC = cC + cE;
5444 							c0 = c0 + c4;
5445 							c8 = c8 + cC;
5446 							c0 = c0 + c8;
5447 							c0 *= 1.0f / 16.0f;
5448 
5449 							*(float*)(source0 + 4 * x) = c0;
5450 						}
5451 
5452 						source0 += pitch;
5453 						source1 += pitch;
5454 						source2 += pitch;
5455 						source3 += pitch;
5456 						source4 += pitch;
5457 						source5 += pitch;
5458 						source6 += pitch;
5459 						source7 += pitch;
5460 						source8 += pitch;
5461 						source9 += pitch;
5462 						sourceA += pitch;
5463 						sourceB += pitch;
5464 						sourceC += pitch;
5465 						sourceD += pitch;
5466 						sourceE += pitch;
5467 						sourceF += pitch;
5468 					}
5469 				}
5470 				else ASSERT(false);
5471 			}
5472 		}
5473 		else if(internal.format == FORMAT_R5G6B5)
5474 		{
5475 			if(CPUID::supportsSSE2() && (width % 8) == 0)
5476 			{
5477 				if(internal.depth == 2)
5478 				{
5479 					for(int y = 0; y < height; y++)
5480 					{
5481 						for(int x = 0; x < width; x += 8)
5482 						{
5483 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5484 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5485 
5486 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5487 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5488 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5489 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5490 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5491 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5492 
5493 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5494 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5495 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5496 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5497 							c0 = _mm_or_si128(c0, c1);
5498 
5499 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5500 						}
5501 
5502 						source0 += pitch;
5503 						source1 += pitch;
5504 					}
5505 				}
5506 				else if(internal.depth == 4)
5507 				{
5508 					for(int y = 0; y < height; y++)
5509 					{
5510 						for(int x = 0; x < width; x += 8)
5511 						{
5512 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5513 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5514 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5515 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5516 
5517 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5518 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5519 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5520 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5521 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5522 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5523 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5524 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5525 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5526 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5527 
5528 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5529 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5530 							c0 = _mm_avg_epu8(c0, c2);
5531 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5532 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5533 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5534 							c1 = _mm_avg_epu16(c1, c3);
5535 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5536 							c0 = _mm_or_si128(c0, c1);
5537 
5538 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5539 						}
5540 
5541 						source0 += pitch;
5542 						source1 += pitch;
5543 						source2 += pitch;
5544 						source3 += pitch;
5545 					}
5546 				}
5547 				else if(internal.depth == 8)
5548 				{
5549 					for(int y = 0; y < height; y++)
5550 					{
5551 						for(int x = 0; x < width; x += 8)
5552 						{
5553 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5554 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5555 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5556 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5557 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5558 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5559 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5560 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5561 
5562 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5563 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5564 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5565 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5566 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5567 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5568 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5569 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5570 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5571 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5572 							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5573 							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5574 							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5575 							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5576 							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5577 							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5578 							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5579 							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5580 
5581 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5582 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5583 							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5584 							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5585 							c0 = _mm_avg_epu8(c0, c2);
5586 							c4 = _mm_avg_epu8(c4, c6);
5587 							c0 = _mm_avg_epu8(c0, c4);
5588 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5589 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5590 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5591 							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5592 							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5593 							c1 = _mm_avg_epu16(c1, c3);
5594 							c5 = _mm_avg_epu16(c5, c7);
5595 							c1 = _mm_avg_epu16(c1, c5);
5596 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5597 							c0 = _mm_or_si128(c0, c1);
5598 
5599 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5600 						}
5601 
5602 						source0 += pitch;
5603 						source1 += pitch;
5604 						source2 += pitch;
5605 						source3 += pitch;
5606 						source4 += pitch;
5607 						source5 += pitch;
5608 						source6 += pitch;
5609 						source7 += pitch;
5610 					}
5611 				}
5612 				else if(internal.depth == 16)
5613 				{
5614 					for(int y = 0; y < height; y++)
5615 					{
5616 						for(int x = 0; x < width; x += 8)
5617 						{
5618 							__m128i c0 = _mm_load_si128((__m128i*)(source0 + 2 * x));
5619 							__m128i c1 = _mm_load_si128((__m128i*)(source1 + 2 * x));
5620 							__m128i c2 = _mm_load_si128((__m128i*)(source2 + 2 * x));
5621 							__m128i c3 = _mm_load_si128((__m128i*)(source3 + 2 * x));
5622 							__m128i c4 = _mm_load_si128((__m128i*)(source4 + 2 * x));
5623 							__m128i c5 = _mm_load_si128((__m128i*)(source5 + 2 * x));
5624 							__m128i c6 = _mm_load_si128((__m128i*)(source6 + 2 * x));
5625 							__m128i c7 = _mm_load_si128((__m128i*)(source7 + 2 * x));
5626 							__m128i c8 = _mm_load_si128((__m128i*)(source8 + 2 * x));
5627 							__m128i c9 = _mm_load_si128((__m128i*)(source9 + 2 * x));
5628 							__m128i cA = _mm_load_si128((__m128i*)(sourceA + 2 * x));
5629 							__m128i cB = _mm_load_si128((__m128i*)(sourceB + 2 * x));
5630 							__m128i cC = _mm_load_si128((__m128i*)(sourceC + 2 * x));
5631 							__m128i cD = _mm_load_si128((__m128i*)(sourceD + 2 * x));
5632 							__m128i cE = _mm_load_si128((__m128i*)(sourceE + 2 * x));
5633 							__m128i cF = _mm_load_si128((__m128i*)(sourceF + 2 * x));
5634 
5635 							static const ushort8 r_b = {0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F, 0xF81F};
5636 							static const ushort8 _g_ = {0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0, 0x07E0};
5637 							__m128i c0_r_b = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5638 							__m128i c0__g_ = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(_g_));
5639 							__m128i c1_r_b = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(r_b));
5640 							__m128i c1__g_ = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5641 							__m128i c2_r_b = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(r_b));
5642 							__m128i c2__g_ = _mm_and_si128(c2, reinterpret_cast<const __m128i&>(_g_));
5643 							__m128i c3_r_b = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(r_b));
5644 							__m128i c3__g_ = _mm_and_si128(c3, reinterpret_cast<const __m128i&>(_g_));
5645 							__m128i c4_r_b = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(r_b));
5646 							__m128i c4__g_ = _mm_and_si128(c4, reinterpret_cast<const __m128i&>(_g_));
5647 							__m128i c5_r_b = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(r_b));
5648 							__m128i c5__g_ = _mm_and_si128(c5, reinterpret_cast<const __m128i&>(_g_));
5649 							__m128i c6_r_b = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(r_b));
5650 							__m128i c6__g_ = _mm_and_si128(c6, reinterpret_cast<const __m128i&>(_g_));
5651 							__m128i c7_r_b = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(r_b));
5652 							__m128i c7__g_ = _mm_and_si128(c7, reinterpret_cast<const __m128i&>(_g_));
5653 							__m128i c8_r_b = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(r_b));
5654 							__m128i c8__g_ = _mm_and_si128(c8, reinterpret_cast<const __m128i&>(_g_));
5655 							__m128i c9_r_b = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(r_b));
5656 							__m128i c9__g_ = _mm_and_si128(c9, reinterpret_cast<const __m128i&>(_g_));
5657 							__m128i cA_r_b = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(r_b));
5658 							__m128i cA__g_ = _mm_and_si128(cA, reinterpret_cast<const __m128i&>(_g_));
5659 							__m128i cB_r_b = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(r_b));
5660 							__m128i cB__g_ = _mm_and_si128(cB, reinterpret_cast<const __m128i&>(_g_));
5661 							__m128i cC_r_b = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(r_b));
5662 							__m128i cC__g_ = _mm_and_si128(cC, reinterpret_cast<const __m128i&>(_g_));
5663 							__m128i cD_r_b = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(r_b));
5664 							__m128i cD__g_ = _mm_and_si128(cD, reinterpret_cast<const __m128i&>(_g_));
5665 							__m128i cE_r_b = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(r_b));
5666 							__m128i cE__g_ = _mm_and_si128(cE, reinterpret_cast<const __m128i&>(_g_));
5667 							__m128i cF_r_b = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(r_b));
5668 							__m128i cF__g_ = _mm_and_si128(cF, reinterpret_cast<const __m128i&>(_g_));
5669 
5670 							c0 = _mm_avg_epu8(c0_r_b, c1_r_b);
5671 							c2 = _mm_avg_epu8(c2_r_b, c3_r_b);
5672 							c4 = _mm_avg_epu8(c4_r_b, c5_r_b);
5673 							c6 = _mm_avg_epu8(c6_r_b, c7_r_b);
5674 							c8 = _mm_avg_epu8(c8_r_b, c9_r_b);
5675 							cA = _mm_avg_epu8(cA_r_b, cB_r_b);
5676 							cC = _mm_avg_epu8(cC_r_b, cD_r_b);
5677 							cE = _mm_avg_epu8(cE_r_b, cF_r_b);
5678 							c0 = _mm_avg_epu8(c0, c2);
5679 							c4 = _mm_avg_epu8(c4, c6);
5680 							c8 = _mm_avg_epu8(c8, cA);
5681 							cC = _mm_avg_epu8(cC, cE);
5682 							c0 = _mm_avg_epu8(c0, c4);
5683 							c8 = _mm_avg_epu8(c8, cC);
5684 							c0 = _mm_avg_epu8(c0, c8);
5685 							c0 = _mm_and_si128(c0, reinterpret_cast<const __m128i&>(r_b));
5686 							c1 = _mm_avg_epu16(c0__g_, c1__g_);
5687 							c3 = _mm_avg_epu16(c2__g_, c3__g_);
5688 							c5 = _mm_avg_epu16(c4__g_, c5__g_);
5689 							c7 = _mm_avg_epu16(c6__g_, c7__g_);
5690 							c9 = _mm_avg_epu16(c8__g_, c9__g_);
5691 							cB = _mm_avg_epu16(cA__g_, cB__g_);
5692 							cD = _mm_avg_epu16(cC__g_, cD__g_);
5693 							cF = _mm_avg_epu16(cE__g_, cF__g_);
5694 							c1 = _mm_avg_epu8(c1, c3);
5695 							c5 = _mm_avg_epu8(c5, c7);
5696 							c9 = _mm_avg_epu8(c9, cB);
5697 							cD = _mm_avg_epu8(cD, cF);
5698 							c1 = _mm_avg_epu8(c1, c5);
5699 							c9 = _mm_avg_epu8(c9, cD);
5700 							c1 = _mm_avg_epu8(c1, c9);
5701 							c1 = _mm_and_si128(c1, reinterpret_cast<const __m128i&>(_g_));
5702 							c0 = _mm_or_si128(c0, c1);
5703 
5704 							_mm_store_si128((__m128i*)(source0 + 2 * x), c0);
5705 						}
5706 
5707 						source0 += pitch;
5708 						source1 += pitch;
5709 						source2 += pitch;
5710 						source3 += pitch;
5711 						source4 += pitch;
5712 						source5 += pitch;
5713 						source6 += pitch;
5714 						source7 += pitch;
5715 						source8 += pitch;
5716 						source9 += pitch;
5717 						sourceA += pitch;
5718 						sourceB += pitch;
5719 						sourceC += pitch;
5720 						sourceD += pitch;
5721 						sourceE += pitch;
5722 						sourceF += pitch;
5723 					}
5724 				}
5725 				else ASSERT(false);
5726 			}
5727 			else
5728 			{
5729 				#define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7BEF) + (((x) ^ (y)) & 0x0821))
5730 
5731 				if(internal.depth == 2)
5732 				{
5733 					for(int y = 0; y < height; y++)
5734 					{
5735 						for(int x = 0; x < width; x++)
5736 						{
5737 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5738 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5739 
5740 							c0 = AVERAGE(c0, c1);
5741 
5742 							*(unsigned short*)(source0 + 2 * x) = c0;
5743 						}
5744 
5745 						source0 += pitch;
5746 						source1 += pitch;
5747 					}
5748 				}
5749 				else if(internal.depth == 4)
5750 				{
5751 					for(int y = 0; y < height; y++)
5752 					{
5753 						for(int x = 0; x < width; x++)
5754 						{
5755 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5756 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5757 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5758 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5759 
5760 							c0 = AVERAGE(c0, c1);
5761 							c2 = AVERAGE(c2, c3);
5762 							c0 = AVERAGE(c0, c2);
5763 
5764 							*(unsigned short*)(source0 + 2 * x) = c0;
5765 						}
5766 
5767 						source0 += pitch;
5768 						source1 += pitch;
5769 						source2 += pitch;
5770 						source3 += pitch;
5771 					}
5772 				}
5773 				else if(internal.depth == 8)
5774 				{
5775 					for(int y = 0; y < height; y++)
5776 					{
5777 						for(int x = 0; x < width; x++)
5778 						{
5779 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5780 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5781 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5782 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5783 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5784 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5785 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5786 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5787 
5788 							c0 = AVERAGE(c0, c1);
5789 							c2 = AVERAGE(c2, c3);
5790 							c4 = AVERAGE(c4, c5);
5791 							c6 = AVERAGE(c6, c7);
5792 							c0 = AVERAGE(c0, c2);
5793 							c4 = AVERAGE(c4, c6);
5794 							c0 = AVERAGE(c0, c4);
5795 
5796 							*(unsigned short*)(source0 + 2 * x) = c0;
5797 						}
5798 
5799 						source0 += pitch;
5800 						source1 += pitch;
5801 						source2 += pitch;
5802 						source3 += pitch;
5803 						source4 += pitch;
5804 						source5 += pitch;
5805 						source6 += pitch;
5806 						source7 += pitch;
5807 					}
5808 				}
5809 				else if(internal.depth == 16)
5810 				{
5811 					for(int y = 0; y < height; y++)
5812 					{
5813 						for(int x = 0; x < width; x++)
5814 						{
5815 							unsigned short c0 = *(unsigned short*)(source0 + 2 * x);
5816 							unsigned short c1 = *(unsigned short*)(source1 + 2 * x);
5817 							unsigned short c2 = *(unsigned short*)(source2 + 2 * x);
5818 							unsigned short c3 = *(unsigned short*)(source3 + 2 * x);
5819 							unsigned short c4 = *(unsigned short*)(source4 + 2 * x);
5820 							unsigned short c5 = *(unsigned short*)(source5 + 2 * x);
5821 							unsigned short c6 = *(unsigned short*)(source6 + 2 * x);
5822 							unsigned short c7 = *(unsigned short*)(source7 + 2 * x);
5823 							unsigned short c8 = *(unsigned short*)(source8 + 2 * x);
5824 							unsigned short c9 = *(unsigned short*)(source9 + 2 * x);
5825 							unsigned short cA = *(unsigned short*)(sourceA + 2 * x);
5826 							unsigned short cB = *(unsigned short*)(sourceB + 2 * x);
5827 							unsigned short cC = *(unsigned short*)(sourceC + 2 * x);
5828 							unsigned short cD = *(unsigned short*)(sourceD + 2 * x);
5829 							unsigned short cE = *(unsigned short*)(sourceE + 2 * x);
5830 							unsigned short cF = *(unsigned short*)(sourceF + 2 * x);
5831 
5832 							c0 = AVERAGE(c0, c1);
5833 							c2 = AVERAGE(c2, c3);
5834 							c4 = AVERAGE(c4, c5);
5835 							c6 = AVERAGE(c6, c7);
5836 							c8 = AVERAGE(c8, c9);
5837 							cA = AVERAGE(cA, cB);
5838 							cC = AVERAGE(cC, cD);
5839 							cE = AVERAGE(cE, cF);
5840 							c0 = AVERAGE(c0, c2);
5841 							c4 = AVERAGE(c4, c6);
5842 							c8 = AVERAGE(c8, cA);
5843 							cC = AVERAGE(cC, cE);
5844 							c0 = AVERAGE(c0, c4);
5845 							c8 = AVERAGE(c8, cC);
5846 							c0 = AVERAGE(c0, c8);
5847 
5848 							*(unsigned short*)(source0 + 2 * x) = c0;
5849 						}
5850 
5851 						source0 += pitch;
5852 						source1 += pitch;
5853 						source2 += pitch;
5854 						source3 += pitch;
5855 						source4 += pitch;
5856 						source5 += pitch;
5857 						source6 += pitch;
5858 						source7 += pitch;
5859 						source8 += pitch;
5860 						source9 += pitch;
5861 						sourceA += pitch;
5862 						sourceB += pitch;
5863 						sourceC += pitch;
5864 						sourceD += pitch;
5865 						sourceE += pitch;
5866 						sourceF += pitch;
5867 					}
5868 				}
5869 				else ASSERT(false);
5870 
5871 				#undef AVERAGE
5872 			}
5873 		}
5874 		else
5875 		{
5876 		//	UNIMPLEMENTED();
5877 		}
5878 	}
5879 }
5880