1/*
2 * (C) Copyright IBM Corporation 2004
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 */
24
25/**
26 * \file read_rgba_span_x86.S
27 * Optimized routines to transfer pixel data from the framebuffer to a
28 * buffer in main memory.
29 *
30 * \author Ian Romanick <idr@us.ibm.com>
31 */
32
33	.file	"read_rgba_span_x86.S"
34#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* LOAD_MASK(mvins, m1, m2)
 *
 * Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 *
 * Builds the two byte-select masks used by the BGRA8888_REV routines on the
 * stack instead of loading them from a data section:
 *
 *   m1 <- 0xff00ff00 repeated (keeps bytes 1 and 3 of every dword)
 *   m2 <- 0x00ff0000 repeated (selects byte 2 of every dword)
 *
 * "mvins" is the move instruction used to load each mask from (%esp).
 * Sixteen bytes are pushed per mask, so the same macro works with an 8-byte
 * load (movq) and with a 16-byte load (movdqu).
 *
 * The trailing addl releases all 32 pushed bytes, so %esp is unchanged on
 * exit.  The arithmetic flags are overwritten by that addl.
 */
#define	LOAD_MASK(mvins,m1,m2) \
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
   	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
   	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp
50
/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

/* DO_ONE_PIXEL()
 *
 * Converts a single 32-bit pixel using integer instructions only: loads one
 * dword from (%ebx), byte-swaps it, rotates it right by 8 bits and stores the
 * result to (%ecx).  Both %ebx and %ecx are advanced by 4 bytes.
 *
 * Clobbers %eax and the arithmetic flags.
 */
#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx
63
/* DO_ONE_LAST_PIXEL()
 *
 * Same conversion as DO_ONE_PIXEL(), but neither %ebx nor %ecx is advanced.
 * It is used for the final pixel of a span, after which the pointers are no
 * longer needed.
 *
 * Clobbers %eax and the arithmetic flags.
 */
#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ;
69
70
/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (offsets are valid after the %ebx push below):
 *    8(%esp)  source pointer
 *   12(%esp)  destination pointer
 *   16(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * Register use:
 *   %ebx       source cursor (saved and restored)
 *   %ecx       destination cursor
 *   %edx       pixels still to convert
 *   %eax       scratch / pair-loop counter
 *   %mm1       0xff00ff00 mask, %mm2 = 0x00ff0000 mask (see LOAD_MASK)
 *   %mm0,3,4   scratch
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.  When
 * USE_INNER_EMMS is defined, EMMS is issued on entry and on every exit
 * path, including the early exit taken for a non-positive pixel count.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	/* %eax = ((-source) >> 2) & 1.  For a 4-byte aligned source this is
	 * 1 exactly when the source is not 8-byte aligned, in which case one
	 * pixel is converted with integer code first (presumably so that the
	 * 8-byte MMX loads below start on an 8-byte boundary).
	 */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	/* %eax = number of pixel pairs.  The loop is entered at its bottom
	 * test; the flags consumed by the first "jne" come from the shrl.
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 *
	 * Bytes 0 and 2 of each dword are exchanged; bytes 1 and 3 are
	 * kept in place by the %mm1 mask.
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	/* Common exit.  The EMMS lives here rather than right after the loop
	 * so that the early "nothing to do" exit, which is taken after
	 * LOAD_MASK has already written %mm1/%mm2, also leaves the MMX state
	 * clean.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
158
159
/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * Stack arguments (offsets are valid after the three pushes below):
 *   16(%esp)  source pointer
 *   20(%esp)  destination pointer
 *   24(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * Register use:
 *   %ebx       source cursor (saved and restored)
 *   %ecx       destination cursor
 *   %edx       pixels still to convert after the alignment prologue
 *   %esi       pixels handled by the alignment prologue (saved and restored)
 *   %ebp       copy of %esp while the aligned scratch slot is in use
 *              (saved and restored)
 *   %eax       scratch / loop counter
 *   %mm1       0xff00ff00 mask, %mm2 = 0x00ff0000 mask (see LOAD_MASK)
 *   %mm0, %mm3-%mm7, %xmm0   scratch
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * Unless USE_INNER_EMMS is defined, this function assumes that the caller
 * will issue the EMMS instruction at the correct places.  When
 * USE_INNER_EMMS is defined, EMMS is issued on entry and once more at the
 * common exit, after the last MMX instruction of every path.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	/* Reserve a 16-byte aligned scratch slot below the current stack
	 * pointer.  It is the bounce buffer used to get data from %xmm0 into
	 * MMX registers.  %ebp remembers the real %esp.
	 */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	/* %eax = ((-source) & 15) >> 2, i.e. for a 4-byte aligned source the
	 * number of pixels in front of the next 16-byte boundary.
	 * %esi = that value, limited to the total pixel count.
	 */

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	/* Alignment prologue: convert %esi (0 to 3) pixels as an optional
	 * single pixel followed by an optional pair.
	 */

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	/* Main loop: four pixels per pass.  %eax = number of passes.  The
	 * loop is entered at its bottom test; the flags consumed by the
	 * first "jne" come from the shrl.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	/* Two interleaved copies of the byte 0 <-> byte 2 exchange: one on
	 * %mm0 (scratch %mm3/%mm4), one on %mm5 (scratch %mm6/%mm7).
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

	/* The scratch slot is no longer needed; restore the real %esp. */
	movl	%ebp, %esp

	/* At this point there are between 0 and 3 pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	/* Common exit.  The EMMS must come after the 2-pixel tail above,
	 * which still uses MMX registers, and must also be reached by the
	 * early "nothing to do" exit, which is taken after LOAD_MASK has
	 * already written %mm1/%mm2.
	 */
#ifdef USE_INNER_EMMS
	emms
#endif
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE
330
331
/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * Stack arguments (offsets are valid after the two pushes below):
 *   12(%esp)  source pointer
 *   16(%esp)  destination pointer
 *   20(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * Register use:
 *   %ebx       source cursor (saved and restored)
 *   %ecx       destination cursor
 *   %edx       pixels still to convert after the alignment prologue
 *   %esi       pixels handled by the alignment prologue (saved and restored)
 *   %eax       scratch / loop counter
 *   %xmm1      0xff00ff00 mask, %xmm2 = 0x00ff0000 mask (see LOAD_MASK)
 *   %xmm0, %xmm3, %xmm4   scratch
 */

/* Exchange bytes 0 and 2 of every dword held in %xmm0, leaving bytes 1 and 3
 * where they are.  Expects the LOAD_MASK masks in %xmm1 and %xmm2.
 * Clobbers %xmm3 and %xmm4.
 */
#define SSE2_SWAP_BYTES_0_2() \
	movdqa	%xmm0, %xmm3 ; \
	movdqa	%xmm0, %xmm4 ; \
	andps	%xmm1, %xmm0 ; \
	andps	%xmm2, %xmm3 ; \
	pslldq	$2, %xmm4 ; \
	psrldq	$2, %xmm3 ; \
	andps	%xmm2, %xmm4 ; \
	orps	%xmm4, %xmm3 ; \
	orps	%xmm3, %xmm0

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	16(%esp), %ecx	/* destination pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */

	testl	%edx, %edx
	jle	.Lsse2_done	/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 *
	 * %eax = ((-source) & 15) >> 2 is the number of pixels in front of
	 * the next 16-byte boundary; %esi is that value limited to the
	 * total pixel count.
	 */

	movl	%ebx, %eax
	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	movl	%edx, %esi
	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	/* Alignment prologue: an optional single pixel, then an optional
	 * pair of pixels.
	 */

	testl	$1, %esi
	jz	.Lsse2_head_pair

	DO_ONE_PIXEL()
.Lsse2_head_pair:
	testl	$2, %esi
	jz	.Lsse2_aligned

	movq	(%ebx), %xmm0
	addl	$8, %ebx
	SSE2_SWAP_BYTES_0_2()
	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_aligned:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	/* Main loop: four pixels per pass, %eax passes in total. */

	movl	%edx, %eax
	shrl	$2, %eax
	jz	.Lsse2_tail_pair
.Lsse2_quad_loop:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx
	SSE2_SWAP_BYTES_0_2()
	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
	jnz	.Lsse2_quad_loop
.Lsse2_tail_pair:

	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	jz	.Lsse2_tail_single

	movq	(%ebx), %xmm0
	addl	$8, %ebx
	SSE2_SWAP_BYTES_0_2()
	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.Lsse2_tail_single:

	testl	$1, %edx
	jz	.Lsse2_done

	DO_ONE_LAST_PIXEL()
.Lsse2_done:

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
465
466
467
/* Constants for the RGB565 routine below.  Each *_L / *_H pair is pushed on
 * the stack (high dword first) and loaded as one 64-bit MMX value, so it is
 * really four 16-bit words.  The words are listed here from lowest to
 * highest.
 */

/* Component masks: 0xf800, 0x07e0, 0x001f, 0x0000.  Applied to a register
 * holding the same RGB565 pixel in all four words, this leaves the 5-bit
 * field in word 0, the 6-bit field in word 1, the low 5-bit field in word 2
 * and nothing in word 3.
 */
#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
/* Per-word pmullw factors: 0x0001, 0x0010, 0x0200, 0x0000. */
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
/* Per-word pmulhuw factors: 0x20E8, 0x40C6, 0x839d, 0x0000. */
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
/* Per-word pmullw factors: 0x0001, 0x0020, 0x0800, 0x0000. */
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
/* Per-word pmulhuw factors: 0x0108, 0x0104, 0x0108, 0x0000. */
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
/* Alpha: 0x0000, 0x0000, 0x0000, 0x00ff.  OR-ed into each converted pixel so
 * that word 3 is always 0xff.
 */
#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000
/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 *
 * Stack arguments:
 *    4(%esp)  source pointer (16-bit pixels)
 *    8(%esp)  destination pointer (32-bit pixels)
 *   12(%esp)  number of pixels to copy; nothing is copied when it is <= 0
 *
 * Register use:
 *   %eax   source cursor
 *   %edx   destination cursor
 *   %ecx   number of 4-pixel passes, later the raw pixel count / scratch
 *   %mm5   component masks   (MASK_565_H:MASK_565_L)
 *   %mm6   pre-scale factors (PRESCALE_H:PRESCALE_L)
 *   %mm7   scale factors     (SCALE_H:SCALE_L)
 *   %mm3   alpha bits        (ALPHA_H:ALPHA_L)
 *   %mm4   raw source pixels, %mm0 / %mm2 = pixels being converted
 *
 * No callee-saved general purpose register is modified.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
        .hidden _generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	/* Build the four 64-bit constants on the stack (high dword pushed
	 * first) and load each one as soon as it is complete.  The final
	 * addl releases all 32 bytes again.
	 */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	/* %ecx = number of 4-pixel passes.  The sign test uses "js" rather
	 * than "jl": SF and ZF are defined after a multi-bit shift, whereas
	 * OF (which "jl" also reads) is architecturally undefined.  The loop
	 * is entered at its bottom test, where "jne" consumes the ZF
	 * produced by this sarl.
	 */

	sarl	$2, %ecx
	js	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	/* Same sequence for the third and fourth pixels of %mm4. */

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there are either 2 or 3 left, process 2.  The original pixel count
	 * is reloaded from the stack because %ecx was used as the loop
	 * counter.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	/* Only 16 bits are read for the final pixel; %ecx is free to be
	 * reused as scratch because the count is not needed any more.
	 */

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
	.size	_generic_read_RGBA_span_RGB565_MMX, .-_generic_read_RGBA_span_RGB565_MMX
674#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */
675
676#if defined (__ELF__) && defined (__linux__)
677	.section .note.GNU-stack,"",%progbits
678#endif
679