/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
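
/* LOAD_MASK leaves m1 holding 0xff00ff00 in every dword (the green and
 * alpha bytes of a BGRA pixel, which stay in place) and m2 holding
 * 0x00ff0000 (the byte lane used below when swapping red and blue).
 */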

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx
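
/* A rough C sketch of what DO_ONE_PIXEL does, assuming a little-endian
 * 32-bit load of a BGRA pixel (illustration only, not part of the build;
 * src and dst stand for the source and destination pointers):
 *
 *	uint32_t p = *src++;              // 0xAARRGGBB in a register
 *	p = __builtin_bswap32(p);         // 0xBBGGRRAA
 *	p = (p >> 8) | (p << 24);         // rotate right 8 -> 0xAABBGGRR
 *	*dst++ = p;                       // bytes in memory: R, G, B, A
 */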

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
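
/* The stack offsets used below imply a cdecl signature along the lines of
 * (names are illustrative only):
 *
 *	void _generic_read_RGBA_span_BGRA8888_REV_MMX(const void *src,
 *	                                               void *dst,
 *	                                               unsigned count);
 */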

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */
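	/* In other words: keep the green and alpha bytes in place
	 * (mm0 & mm1), move each red byte down into the blue slot
	 * ((mm0 & mm2) >> 16), move each blue byte up into the red slot
	 * ((mm0 << 16) & mm2), and OR the three pieces back together.
	 */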

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

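	/* Set aside a 16-byte aligned scratch slot on the stack (original
	 * %esp saved in %ebp); the loop below spills each MOVAPS load there
	 * so the data can be picked back up in MMX registers.
	 */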
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */
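
/* Unlike the MMX and SSE variants above, this version works entirely in the
 * XMM registers (four pixels per loop iteration), so there is no MMX state
 * to clean up with EMMS.
 */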

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
#ifndef USE_DRICORE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
#endif
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */
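	/* The next three instructions compute the number of leading pixels
	 * to handle one at a time; in C terms (sketch only, assuming 4-byte
	 * pixels):
	 *
	 *	lead = ((0u - (uintptr_t) src) & 15) / 4;
	 */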

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2-pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
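
/* How the constants fit together (SCALE_ADJUST == 5, red channel shown):
 * after masking, the red word holds r << 11.  PMULLW by the red PRESCALE
 * word (1) and PSRLW by 5 leave r << 6; PMULHUW by the red SCALE word
 * (0x20E8 = 8424) then gives (r * 64 * 8424) >> 16 = (r * 8424) / 1024,
 * i.e. roughly r * 255 / 31, so 0x1f expands to 0xff.  The green and blue
 * words use the other PRESCALE/SCALE words for the matching * 255 / 63 and
 * * 255 / 31 expansions.
 */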

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */
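
/* Strategy: each RGB565 pixel is broadcast into all four 16-bit words of an
 * MMX register, masked so that every word holds a single component, widened
 * to 8 bits with PMULLW/PSRLW/PMULHUW using the constants above, given an
 * opaque alpha, and packed back down with PACKUSWB.
 */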

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
#ifndef USE_DRICORE
	.hidden _generic_read_RGBA_span_RGB565_MMX
#endif
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */
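
	/* PSHUFW with immediate 0x00 broadcasts word 0 of %mm4 (the first
	 * pixel) into every word of the destination; 0x55 broadcasts word 1
	 * (the second pixel).  The second half of the loop uses 0xaa and
	 * 0xff for the third and fourth pixels.
	 */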

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif