1 /*
2  * Loongson MMI optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
5  *                          All Rights Reserved.
6  * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
7  *
8  * This software is provided 'as-is', without any express or implied
9  * warranty.  In no event will the authors be held liable for any damages
10  * arising from the use of this software.
11  *
12  * Permission is granted to anyone to use this software for any purpose,
13  * including commercial applications, and to alter it and redistribute it
14  * freely, subject to the following restrictions:
15  *
16  * 1. The origin of this software must not be misrepresented; you must not
17  *    claim that you wrote the original software. If you use this software
18  *    in a product, an acknowledgment in the product documentation would be
19  *    appreciated but is not required.
20  * 2. Altered source versions must be plainly marked as such, and must not be
21  *    misrepresented as being the original software.
22  * 3. This notice may not be removed or altered from any source distribution.
23  */
24 
25 #ifndef __LOONGSON_MMINTRIN_H__
26 #define __LOONGSON_MMINTRIN_H__
27 
28 #include <stdint.h>
29 
30 
/* Attribute set attached to every intrinsic wrapper in this header:
   __gnu_inline__ selects GNU inline semantics for the `extern __inline`
   definitions, __always_inline__ forces inlining, and __artificial__ marks the
   wrappers as artificial for debug information. */
#define FUNCTION_ATTRIBS \
  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
33 
34 
/* Vectors are stored in 64-bit floating-point registers.  The double type
   serves as the container that the "f" asm constraints in this header bind to
   a floating-point register. */
typedef double __m64;

/* Having a 32-bit datatype allows us to use 32-bit loads in places like
   load8888. */
typedef float __m32;
41 
42 
43 /********** Set Operations **********/
44 
45 extern __inline __m64 FUNCTION_ATTRIBS
_mm_setzero_si64(void)46 _mm_setzero_si64(void)
47 {
48   return 0.0;
49 }
50 
51 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi8(uint8_t __b7,uint8_t __b6,uint8_t __b5,uint8_t __b4,uint8_t __b3,uint8_t __b2,uint8_t __b1,uint8_t __b0)52 _mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
53             uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
54 {
55   __m64 ret;
56   uint32_t lo = ((uint32_t)__b6 << 24) |
57                 ((uint32_t)__b4 << 16) |
58                 ((uint32_t)__b2 << 8) |
59                 (uint32_t)__b0;
60   uint32_t hi = ((uint32_t)__b7 << 24) |
61                 ((uint32_t)__b5 << 16) |
62                 ((uint32_t)__b3 << 8) |
63                 (uint32_t)__b1;
64 
65   asm("mtc1      %1, %0\n\t"
66       "mtc1      %2, $f0\n\t"
67       "punpcklbh %0, %0, $f0\n\t"
68       : "=f" (ret)
69       : "r" (lo), "r" (hi)
70       : "$f0"
71      );
72 
73   return ret;
74 }
75 
76 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi16(uint16_t __h3,uint16_t __h2,uint16_t __h1,uint16_t __h0)77 _mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
78 {
79   __m64 ret;
80   uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
81   uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
82 
83   asm("mtc1      %1, %0\n\t"
84       "mtc1      %2, $f0\n\t"
85       "punpcklhw %0, %0, $f0\n\t"
86       : "=f" (ret)
87       : "r" (lo), "r" (hi)
88       : "$f0"
89      );
90 
91   return ret;
92 }
93 
/* Combines four selector fields into one integer: fp3 is shifted to bit 6,
   fp2 to bit 4, fp1 to bit 2, and fp0 stays at bit 0.  The arguments are not
   masked or range-checked.  Within this header the result is used as the
   selector operand of pshufh. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
96 
97 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set_pi32(uint32_t __i1,uint32_t __i0)98 _mm_set_pi32(uint32_t __i1, uint32_t __i0)
99 {
100   if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
101     uint64_t val = ((uint64_t)__i1 << 32) |
102                    ((uint64_t)__i0 <<  0);
103 
104     return *(__m64 *)&val;
105   } else if (__i1 == __i0) {
106     uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
107     __m64 ret;
108 
109     asm("pshufh %0, %1, %2\n\t"
110         : "=f" (ret)
111         : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
112        );
113 
114     return ret;
115   } else {
116     uint64_t val = ((uint64_t)__i1 << 32) |
117                    ((uint64_t)__i0 <<  0);
118 
119     return *(__m64 *)&val;
120   }
121 }
122 
123 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi8(uint8_t __b0)124 _mm_set1_pi8(uint8_t __b0)
125 {
126   __m64 ret;
127 
128   asm("sll    $8, %1, 8\n\t"
129       "or     %1, %1, $8\n\t"
130       "mtc1   %1, %0\n\t"
131       "mtc1   $0, $f0\n\t"
132       "pshufh %0, %0, $f0\n\t"
133       : "=f" (ret)
134       : "r" (__b0)
135       : "$8", "$f0"
136      );
137 
138   return ret;
139 }
140 
141 extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi16(uint16_t __h0)142 _mm_set1_pi16(uint16_t __h0)
143 {
144   __m64 ret;
145 
146   asm("mtc1   %1, %0\n\t"
147       "mtc1   $0, $f0\n\t"
148       "pshufh %0, %0, $f0\n\t"
149       : "=f" (ret)
150       : "r" (__h0)
151       : "$8", "$f0"
152      );
153 
154   return ret;
155 }
156 
/* Returns a vector with __i0 in both 32-bit halves, by passing the same value
   as both arguments of _mm_set_pi32(). */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_set1_pi32(unsigned __i0)
{
  return _mm_set_pi32(__i0, __i0);
}
162 
/* Reversed-argument form of _mm_set_pi8(): the first argument here (__h0) is
   passed as the last argument of _mm_set_pi8() and the last (__h7) as the
   first. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
{
  return _mm_set_pi8(__h7, __h6, __h5, __h4,
                     __h3, __h2, __h1, __h0);
}
170 
/* Reversed-argument form of _mm_set_pi16(): __w0 is forwarded as the last
   argument and __w3 as the first. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
{
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
176 
/* Reversed-argument form of _mm_set_pi32(): __i0 becomes the low word and
   __i1 the high word of the result. */
extern __inline __m64 FUNCTION_ATTRIBS
_mm_setr_pi32(uint32_t __i0, uint32_t __i1)
{
  return _mm_set_pi32(__i1, __i0);
}
182 
183 
184 /********** Arithmetic Operations **********/
185 
186 extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi8(__m64 __m1,__m64 __m2)187 _mm_add_pi8(__m64 __m1, __m64 __m2)
188 {
189   __m64 ret;
190 
191   asm("paddb %0, %1, %2\n\t"
192       : "=f" (ret)
193       : "f" (__m1), "f" (__m2)
194      );
195 
196   return ret;
197 }
198 
199 extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi16(__m64 __m1,__m64 __m2)200 _mm_add_pi16(__m64 __m1, __m64 __m2)
201 {
202   __m64 ret;
203 
204   asm("paddh %0, %1, %2\n\t"
205       : "=f" (ret)
206       : "f" (__m1), "f" (__m2)
207      );
208 
209   return ret;
210 }
211 
212 extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_pi32(__m64 __m1,__m64 __m2)213 _mm_add_pi32(__m64 __m1, __m64 __m2)
214 {
215   __m64 ret;
216 
217   asm("paddw %0, %1, %2\n\t"
218       : "=f" (ret)
219       : "f" (__m1), "f" (__m2)
220      );
221 
222   return ret;
223 }
224 
225 extern __inline __m64 FUNCTION_ATTRIBS
_mm_add_si64(__m64 __m1,__m64 __m2)226 _mm_add_si64(__m64 __m1, __m64 __m2)
227 {
228   __m64 ret;
229 
230   asm("paddd %0, %1, %2\n\t"
231       : "=f" (ret)
232       : "f" (__m1), "f" (__m2)
233      );
234 
235   return ret;
236 }
237 
238 extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi8(__m64 __m1,__m64 __m2)239 _mm_adds_pi8(__m64 __m1, __m64 __m2)
240 {
241   __m64 ret;
242 
243   asm("paddsb %0, %1, %2\n\t"
244       : "=f" (ret)
245       : "f" (__m1), "f" (__m2)
246      );
247 
248   return ret;
249 }
250 
251 extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pi16(__m64 __m1,__m64 __m2)252 _mm_adds_pi16(__m64 __m1, __m64 __m2)
253 {
254   __m64 ret;
255 
256   asm("paddsh %0, %1, %2\n\t"
257       : "=f" (ret)
258       : "f" (__m1), "f" (__m2)
259      );
260 
261   return ret;
262 }
263 
264 
265 extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu8(__m64 __m1,__m64 __m2)266 _mm_adds_pu8(__m64 __m1, __m64 __m2)
267 {
268   __m64 ret;
269 
270   asm("paddusb %0, %1, %2\n\t"
271       : "=f" (ret)
272       : "f" (__m1), "f" (__m2)
273      );
274 
275   return ret;
276 }
277 
278 extern __inline __m64 FUNCTION_ATTRIBS
_mm_adds_pu16(__m64 __m1,__m64 __m2)279 _mm_adds_pu16(__m64 __m1, __m64 __m2)
280 {
281   __m64 ret;
282 
283   asm("paddush %0, %1, %2\n\t"
284       : "=f" (ret)
285       : "f" (__m1), "f" (__m2)
286      );
287 
288   return ret;
289 }
290 
291 extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu8(__m64 __m1,__m64 __m2)292 _mm_avg_pu8(__m64 __m1, __m64 __m2)
293 {
294   __m64 ret;
295 
296   asm("pavgb %0, %1, %2\n\t"
297       : "=f" (ret)
298       : "f" (__m1), "f" (__m2)
299      );
300 
301   return ret;
302 }
303 
304 extern __inline __m64 FUNCTION_ATTRIBS
_mm_avg_pu16(__m64 __m1,__m64 __m2)305 _mm_avg_pu16(__m64 __m1, __m64 __m2)
306 {
307   __m64 ret;
308 
309   asm("pavgh %0, %1, %2\n\t"
310       : "=f" (ret)
311       : "f" (__m1), "f" (__m2)
312      );
313 
314   return ret;
315 }
316 
317 extern __inline __m64 FUNCTION_ATTRIBS
_mm_madd_pi16(__m64 __m1,__m64 __m2)318 _mm_madd_pi16(__m64 __m1, __m64 __m2)
319 {
320   __m64 ret;
321 
322   asm("pmaddhw %0, %1, %2\n\t"
323       : "=f" (ret)
324       : "f" (__m1), "f" (__m2)
325      );
326 
327   return ret;
328 }
329 
330 extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pi16(__m64 __m1,__m64 __m2)331 _mm_max_pi16(__m64 __m1, __m64 __m2)
332 {
333   __m64 ret;
334 
335   asm("pmaxsh %0, %1, %2\n\t"
336       : "=f" (ret)
337       : "f" (__m1), "f" (__m2)
338      );
339 
340   return ret;
341 }
342 
343 extern __inline __m64 FUNCTION_ATTRIBS
_mm_max_pu8(__m64 __m1,__m64 __m2)344 _mm_max_pu8(__m64 __m1, __m64 __m2)
345 {
346   __m64 ret;
347 
348   asm("pmaxub %0, %1, %2\n\t"
349       : "=f" (ret)
350       : "f" (__m1), "f" (__m2)
351      );
352 
353   return ret;
354 }
355 
356 extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pi16(__m64 __m1,__m64 __m2)357 _mm_min_pi16(__m64 __m1, __m64 __m2)
358 {
359   __m64 ret;
360 
361   asm("pminsh %0, %1, %2\n\t"
362       : "=f" (ret)
363       : "f" (__m1), "f" (__m2)
364      );
365 
366   return ret;
367 }
368 
369 extern __inline __m64 FUNCTION_ATTRIBS
_mm_min_pu8(__m64 __m1,__m64 __m2)370 _mm_min_pu8(__m64 __m1, __m64 __m2)
371 {
372   __m64 ret;
373 
374   asm("pminub %0, %1, %2\n\t"
375       : "=f" (ret)
376       : "f" (__m1), "f" (__m2)
377      );
378 
379   return ret;
380 }
381 
382 extern __inline int FUNCTION_ATTRIBS
_mm_movemask_pi8(__m64 __m1)383 _mm_movemask_pi8(__m64 __m1)
384 {
385   int ret;
386 
387   asm("pmovmskb %0, %1\n\t"
388       : "=r" (ret)
389       : "y" (__m1)
390      );
391 
392   return ret;
393 }
394 
395 extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pi16(__m64 __m1,__m64 __m2)396 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
397 {
398   __m64 ret;
399 
400   asm("pmulhh %0, %1, %2\n\t"
401       : "=f" (ret)
402       : "f" (__m1), "f" (__m2)
403      );
404 
405   return ret;
406 }
407 
408 extern __inline __m64 FUNCTION_ATTRIBS
_mm_mulhi_pu16(__m64 __m1,__m64 __m2)409 _mm_mulhi_pu16(__m64 __m1, __m64 __m2)
410 {
411   __m64 ret;
412 
413   asm("pmulhuh %0, %1, %2\n\t"
414       : "=f" (ret)
415       : "f" (__m1), "f" (__m2)
416      );
417 
418   return ret;
419 }
420 
421 extern __inline __m64 FUNCTION_ATTRIBS
_mm_mullo_pi16(__m64 __m1,__m64 __m2)422 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
423 {
424   __m64 ret;
425 
426   asm("pmullh %0, %1, %2\n\t"
427       : "=f" (ret)
428       : "f" (__m1), "f" (__m2)
429      );
430 
431   return ret;
432 }
433 
434 extern __inline __m64 FUNCTION_ATTRIBS
_mm_mul_pu32(__m64 __m1,__m64 __m2)435 _mm_mul_pu32(__m64 __m1, __m64 __m2)
436 {
437   __m64 ret;
438 
439   asm("pmuluw %0, %1, %2\n\t"
440       : "=f" (ret)
441       : "f" (__m1), "f" (__m2)
442      );
443 
444   return ret;
445 }
446 
447 extern __inline __m64 FUNCTION_ATTRIBS
_mm_sad_pu8(__m64 __m1,__m64 __m2)448 _mm_sad_pu8(__m64 __m1, __m64 __m2)
449 {
450   __m64 ret;
451 
452   asm("psadbh %0, %1, %2\n\t"
453       : "=f" (ret)
454       : "f" (__m1), "f" (__m2)
455      );
456 
457   return ret;
458 }
459 
460 
461 extern __inline __m64 FUNCTION_ATTRIBS
_mm_asub_pu8(__m64 __m1,__m64 __m2)462 _mm_asub_pu8(__m64 __m1, __m64 __m2)
463 {
464   __m64 ret;
465 
466   asm("pasubub %0, %1, %2\n\t"
467       : "=f" (ret)
468       : "f" (__m1), "f" (__m2)
469      );
470 
471   return ret;
472 }
473 
474 extern __inline __m64 FUNCTION_ATTRIBS
_mm_biadd_pu8(__m64 __m1,__m64 __m2)475 _mm_biadd_pu8(__m64 __m1, __m64 __m2)
476 {
477   __m64 ret;
478 
479   asm("biadd %0, %1, %2\n\t"
480       : "=f" (ret)
481       : "f" (__m1), "f" (__m2)
482      );
483 
484   return ret;
485 }
486 
487 extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi8(__m64 __m1,__m64 __m2)488 _mm_sub_pi8(__m64 __m1, __m64 __m2)
489 {
490   __m64 ret;
491 
492   asm("psubb %0, %1, %2\n\t"
493       : "=f" (ret)
494       : "f" (__m1), "f" (__m2)
495      );
496 
497   return ret;
498 }
499 
500 extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi16(__m64 __m1,__m64 __m2)501 _mm_sub_pi16(__m64 __m1, __m64 __m2)
502 {
503   __m64 ret;
504 
505   asm("psubh %0, %1, %2\n\t"
506       : "=f" (ret)
507       : "f" (__m1), "f" (__m2)
508      );
509 
510   return ret;
511 }
512 
513 extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_pi32(__m64 __m1,__m64 __m2)514 _mm_sub_pi32(__m64 __m1, __m64 __m2)
515 {
516   __m64 ret;
517 
518   asm("psubw %0, %1, %2\n\t"
519       : "=f" (ret)
520       : "f" (__m1), "f" (__m2)
521      );
522 
523   return ret;
524 }
525 
526 extern __inline __m64 FUNCTION_ATTRIBS
_mm_sub_si64(__m64 __m1,__m64 __m2)527 _mm_sub_si64(__m64 __m1, __m64 __m2)
528 {
529   __m64 ret;
530 
531   asm("psubd %0, %1, %2\n\t"
532       : "=f" (ret)
533       : "f" (__m1), "f" (__m2)
534      );
535 
536   return ret;
537 }
538 
539 extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi8(__m64 __m1,__m64 __m2)540 _mm_subs_pi8(__m64 __m1, __m64 __m2)
541 {
542   __m64 ret;
543 
544   asm("psubsb %0, %1, %2\n\t"
545       : "=f" (ret)
546       : "f" (__m1), "f" (__m2)
547      );
548 
549   return ret;
550 }
551 
552 extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pi16(__m64 __m1,__m64 __m2)553 _mm_subs_pi16(__m64 __m1, __m64 __m2)
554 {
555   __m64 ret;
556 
557   asm("psubsh %0, %1, %2\n\t"
558       : "=f" (ret)
559       : "f" (__m1), "f" (__m2)
560      );
561 
562   return ret;
563 }
564 
565 
566 extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu8(__m64 __m1,__m64 __m2)567 _mm_subs_pu8(__m64 __m1, __m64 __m2)
568 {
569   __m64 ret;
570 
571   asm("psubusb %0, %1, %2\n\t"
572       : "=f" (ret)
573       : "f" (__m1), "f" (__m2)
574      );
575 
576   return ret;
577 }
578 
579 extern __inline __m64 FUNCTION_ATTRIBS
_mm_subs_pu16(__m64 __m1,__m64 __m2)580 _mm_subs_pu16(__m64 __m1, __m64 __m2)
581 {
582   __m64 ret;
583 
584   asm("psubush %0, %1, %2\n\t"
585       : "=f" (ret)
586       : "f" (__m1), "f" (__m2)
587      );
588 
589   return ret;
590 }
591 
592 
593 /********** Logical Operations **********/
594 
595 extern __inline __m64 FUNCTION_ATTRIBS
_mm_and_si64(__m64 __m1,__m64 __m2)596 _mm_and_si64(__m64 __m1, __m64 __m2)
597 {
598   __m64 ret;
599 
600   asm("and %0, %1, %2\n\t"
601       : "=f" (ret)
602       : "f" (__m1), "f" (__m2)
603      );
604 
605   return ret;
606 }
607 
608 extern __inline __m64 FUNCTION_ATTRIBS
_mm_andnot_si64(__m64 __m1,__m64 __m2)609 _mm_andnot_si64(__m64 __m1, __m64 __m2)
610 {
611   __m64 ret;
612 
613   asm("andn %0, %1, %2\n\t"
614       : "=f" (ret)
615       : "f" (__m1), "f" (__m2)
616      );
617 
618   return ret;
619 }
620 
621 
622 extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si32(__m32 __m1,__m32 __m2)623 _mm_or_si32(__m32 __m1, __m32 __m2)
624 {
625   __m32 ret;
626 
627   asm("or %0, %1, %2\n\t"
628       : "=f" (ret)
629       : "f" (__m1), "f" (__m2)
630      );
631 
632   return ret;
633 }
634 
635 extern __inline __m64 FUNCTION_ATTRIBS
_mm_or_si64(__m64 __m1,__m64 __m2)636 _mm_or_si64(__m64 __m1, __m64 __m2)
637 {
638   __m64 ret;
639 
640   asm("or %0, %1, %2\n\t"
641       : "=f" (ret)
642       : "f" (__m1), "f" (__m2)
643      );
644 
645   return ret;
646 }
647 
648 extern __inline __m64 FUNCTION_ATTRIBS
_mm_xor_si64(__m64 __m1,__m64 __m2)649 _mm_xor_si64(__m64 __m1, __m64 __m2)
650 {
651   __m64 ret;
652 
653   asm("xor %0, %1, %2\n\t"
654       : "=f" (ret)
655       : "f" (__m1), "f" (__m2)
656      );
657 
658   return ret;
659 }
660 
661 
662 /********** Shift Operations **********/
663 
664 extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi16(__m64 __m,int64_t __count)665 _mm_slli_pi16(__m64 __m, int64_t __count)
666 {
667   __m64 ret;
668 
669   asm("psllh  %0, %1, %2\n\t"
670       : "=f" (ret)
671       : "f" (__m), "f" (*(__m64 *)&__count)
672      );
673 
674   return ret;
675 }
676 
677 extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_pi32(__m64 __m,int64_t __count)678 _mm_slli_pi32(__m64 __m, int64_t __count)
679 {
680   __m64 ret;
681 
682   asm("psllw %0, %1, %2\n\t"
683       : "=f" (ret)
684       : "f" (__m), "f" (*(__m64 *)&__count)
685      );
686 
687   return ret;
688 }
689 
690 extern __inline __m64 FUNCTION_ATTRIBS
_mm_slli_si64(__m64 __m,int64_t __count)691 _mm_slli_si64(__m64 __m, int64_t __count)
692 {
693   __m64 ret;
694 
695   asm("dsll  %0, %1, %2\n\t"
696       : "=f" (ret)
697       : "f" (__m), "f" (*(__m64 *)&__count)
698      );
699 
700   return ret;
701 }
702 
703 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi16(__m64 __m,int64_t __count)704 _mm_srli_pi16(__m64 __m, int64_t __count)
705 {
706   __m64 ret;
707 
708   asm("psrlh %0, %1, %2\n\t"
709       : "=f" (ret)
710       : "f" (__m), "f" (*(__m64 *)&__count)
711      );
712 
713   return ret;
714 }
715 
716 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_pi32(__m64 __m,int64_t __count)717 _mm_srli_pi32(__m64 __m, int64_t __count)
718 {
719   __m64 ret;
720 
721   asm("psrlw %0, %1, %2\n\t"
722       : "=f" (ret)
723       : "f" (__m), "f" (*(__m64 *)&__count)
724      );
725 
726   return ret;
727 }
728 
729 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srli_si64(__m64 __m,int64_t __count)730 _mm_srli_si64(__m64 __m, int64_t __count)
731 {
732   __m64 ret;
733 
734   asm("dsrl  %0, %1, %2\n\t"
735       : "=f" (ret)
736       : "f" (__m), "f" (*(__m64 *)&__count)
737      );
738 
739   return ret;
740 }
741 
742 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi16(__m64 __m,int64_t __count)743 _mm_srai_pi16(__m64 __m, int64_t __count)
744 {
745   __m64 ret;
746 
747   asm("psrah %0, %1, %2\n\t"
748       : "=f" (ret)
749       : "f" (__m), "f" (*(__m64 *)&__count)
750      );
751 
752   return ret;
753 }
754 
755 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_pi32(__m64 __m,int64_t __count)756 _mm_srai_pi32(__m64 __m, int64_t __count)
757 {
758   __m64 ret;
759 
760   asm("psraw %0, %1, %2\n\t"
761       : "=f" (ret)
762       : "f" (__m), "f" (*(__m64 *)&__count)
763      );
764 
765   return ret;
766 }
767 
768 extern __inline __m64 FUNCTION_ATTRIBS
_mm_srai_si64(__m64 __m,int64_t __count)769 _mm_srai_si64(__m64 __m, int64_t __count)
770 {
771   __m64 ret;
772 
773   asm("dsra %0, %1, %2\n\t"
774       : "=f" (ret)
775       : "f" (__m), "f" (*(__m64 *)&__count)
776      );
777 
778   return ret;
779 }
780 
781 
782 /********** Conversion Intrinsics **********/
783 
784 extern __inline __m64 FUNCTION_ATTRIBS
to_m64(uint64_t x)785 to_m64(uint64_t x)
786 {
787   return *(__m64 *)&x;
788 }
789 
790 extern __inline uint64_t FUNCTION_ATTRIBS
to_uint64(__m64 x)791 to_uint64(__m64 x)
792 {
793   return *(uint64_t *)&x;
794 }
795 
796 
797 /********** Comparison Intrinsics **********/
798 
799 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi8(__m64 __m1,__m64 __m2)800 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
801 {
802   __m64 ret;
803 
804   asm("pcmpeqb %0, %1, %2\n\t"
805       : "=f" (ret)
806       : "f" (__m1), "f" (__m2)
807      );
808 
809   return ret;
810 }
811 
812 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi16(__m64 __m1,__m64 __m2)813 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
814 {
815   __m64 ret;
816 
817   asm("pcmpeqh %0, %1, %2\n\t"
818       : "=f" (ret)
819       : "f" (__m1), "f" (__m2)
820      );
821 
822   return ret;
823 }
824 
825 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpeq_pi32(__m64 __m1,__m64 __m2)826 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
827 {
828   __m64 ret;
829 
830   asm("pcmpeqw %0, %1, %2\n\t"
831       : "=f" (ret)
832       : "f" (__m1), "f" (__m2)
833      );
834 
835   return ret;
836 }
837 
838 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi8(__m64 __m1,__m64 __m2)839 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
840 {
841   __m64 ret;
842 
843   asm("pcmpgtb %0, %1, %2\n\t"
844       : "=f" (ret)
845       : "f" (__m1), "f" (__m2)
846      );
847 
848   return ret;
849 }
850 
851 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi16(__m64 __m1,__m64 __m2)852 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
853 {
854   __m64 ret;
855 
856   asm("pcmpgth %0, %1, %2\n\t"
857       : "=f" (ret)
858       : "f" (__m1), "f" (__m2)
859      );
860 
861   return ret;
862 }
863 
864 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmpgt_pi32(__m64 __m1,__m64 __m2)865 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
866 {
867   __m64 ret;
868 
869   asm("pcmpgtw %0, %1, %2\n\t"
870       : "=f" (ret)
871       : "f" (__m1), "f" (__m2)
872      );
873 
874   return ret;
875 }
876 
877 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi8(__m64 __m1,__m64 __m2)878 _mm_cmplt_pi8(__m64 __m1, __m64 __m2)
879 {
880   __m64 ret;
881 
882   asm("pcmpltb %0, %1, %2\n\t"
883       : "=f" (ret)
884       : "f" (__m1), "f" (__m2)
885      );
886 
887   return ret;
888 }
889 
890 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi16(__m64 __m1,__m64 __m2)891 _mm_cmplt_pi16(__m64 __m1, __m64 __m2)
892 {
893   __m64 ret;
894 
895   asm("pcmplth %0, %1, %2\n\t"
896       : "=f" (ret)
897       : "f" (__m1), "f" (__m2)
898      );
899 
900   return ret;
901 }
902 
903 extern __inline __m64 FUNCTION_ATTRIBS
_mm_cmplt_pi32(__m64 __m1,__m64 __m2)904 _mm_cmplt_pi32(__m64 __m1, __m64 __m2)
905 {
906   __m64 ret;
907 
908   asm("pcmpltw %0, %1, %2\n\t"
909       : "=f" (ret)
910       : "f" (__m1), "f" (__m2)
911      );
912 
913   return ret;
914 }
915 
916 
917 /********** Miscellaneous Operations **********/
918 
919 extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi16(__m64 __m1,__m64 __m2)920 _mm_packs_pi16(__m64 __m1, __m64 __m2)
921 {
922   __m64 ret;
923 
924   asm("packsshb %0, %1, %2\n\t"
925       : "=f" (ret)
926       : "f" (__m1), "f" (__m2)
927      );
928 
929   return ret;
930 }
931 
932 extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32(__m64 __m1,__m64 __m2)933 _mm_packs_pi32(__m64 __m1, __m64 __m2)
934 {
935   __m64 ret;
936 
937   asm("packsswh %0, %1, %2\n\t"
938       : "=f" (ret)
939       : "f" (__m1), "f" (__m2)
940      );
941 
942   return ret;
943 }
944 
945 extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pi32_f(__m64 __m1,__m64 __m2)946 _mm_packs_pi32_f(__m64 __m1, __m64 __m2)
947 {
948   __m64 ret;
949 
950   asm("packsswh %0, %1, %2\n\t"
951       : "=f" (ret)
952       : "f" (__m1), "f" (__m2)
953      );
954 
955   return ret;
956 }
957 
958 extern __inline __m64 FUNCTION_ATTRIBS
_mm_packs_pu16(__m64 __m1,__m64 __m2)959 _mm_packs_pu16(__m64 __m1, __m64 __m2)
960 {
961   __m64 ret;
962 
963   asm("packushb %0, %1, %2\n\t"
964       : "=f" (ret)
965       : "f" (__m1), "f" (__m2)
966      );
967 
968   return ret;
969 }
970 
971 extern __inline __m64 FUNCTION_ATTRIBS
_mm_extract_pi16(__m64 __m,int64_t __pos)972 _mm_extract_pi16(__m64 __m, int64_t __pos)
973 {
974   __m64 ret;
975 
976   asm("pextrh %0, %1, %2\n\t"
977       : "=f" (ret)
978       : "f" (__m), "f" (*(__m64 *)&__pos)
979      );
980 
981   return ret;
982 }
983 
984 extern __inline __m64 FUNCTION_ATTRIBS
_mm_insert_pi16(__m64 __m1,__m64 __m2,int64_t __pos)985 _mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
986 {
987   __m64 ret;
988 
989   switch (__pos) {
990   case 0:
991 
992     asm("pinsrh_0 %0, %1, %2\n\t"
993         : "=f" (ret)
994         : "f" (__m1), "f" (__m2), "i" (__pos)
995        );
996 
997     break;
998 
999   case 1:
1000 
1001     asm("pinsrh_1 %0, %1, %2\n\t"
1002         : "=f" (ret)
1003         : "f" (__m1), "f" (__m2), "i" (__pos)
1004        );
1005 
1006     break;
1007   case 2:
1008 
1009     asm("pinsrh_2 %0, %1, %2\n\t"
1010         : "=f" (ret)
1011         : "f" (__m1), "f" (__m2), "i" (__pos)
1012        );
1013 
1014     break;
1015 
1016   case 3:
1017 
1018     asm("pinsrh_3 %0, %1, %2\n\t"
1019         : "=f" (ret)
1020         : "f" (__m1), "f" (__m2), "i" (__pos)
1021        );
1022 
1023     break;
1024   }
1025 
1026   return ret;
1027 }
1028 
1029 extern __inline __m64 FUNCTION_ATTRIBS
_mm_shuffle_pi16(__m64 __m,int64_t __n)1030 _mm_shuffle_pi16(__m64 __m, int64_t __n)
1031 {
1032   __m64 ret;
1033 
1034   asm("pshufh %0, %1, %2\n\t"
1035       : "=f" (ret)
1036       : "f" (__m), "f" (*(__m64 *)&__n)
1037      );
1038 
1039   return ret;
1040 }
1041 
1042 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8(__m64 __m1,__m64 __m2)1043 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
1044 {
1045   __m64 ret;
1046 
1047   asm("punpckhbh %0, %1, %2\n\t"
1048       : "=f" (ret)
1049       : "f" (__m1), "f" (__m2)
1050      );
1051 
1052   return ret;
1053 }
1054 
1055 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi8_f(__m64 __m1,__m64 __m2)1056 _mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
1057 {
1058   __m64 ret;
1059 
1060   asm("punpckhbh %0, %1, %2\n\t"
1061       : "=f" (ret)
1062       : "f" (__m1), "f" (__m2)
1063      );
1064 
1065   return ret;
1066 }
1067 
1068 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16(__m64 __m1,__m64 __m2)1069 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
1070 {
1071   __m64 ret;
1072 
1073   asm("punpckhhw %0, %1, %2\n\t"
1074       : "=f" (ret)
1075       : "f" (__m1), "f" (__m2)
1076      );
1077 
1078   return ret;
1079 }
1080 
1081 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi16_f(__m64 __m1,__m64 __m2)1082 _mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
1083 {
1084   __m64 ret;
1085 
1086   asm("punpckhhw %0, %1, %2\n\t"
1087       : "=f" (ret)
1088       : "f" (__m1), "f" (__m2)
1089      );
1090 
1091   return ret;
1092 }
1093 
1094 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpackhi_pi32(__m64 __m1,__m64 __m2)1095 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
1096 {
1097   __m64 ret;
1098 
1099   asm("punpckhwd %0, %1, %2\n\t"
1100       : "=f" (ret)
1101       : "f" (__m1), "f" (__m2)
1102      );
1103 
1104   return ret;
1105 }
1106 
1107 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8(__m64 __m1,__m64 __m2)1108 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
1109 {
1110   __m64 ret;
1111 
1112   asm("punpcklbh %0, %1, %2\n\t"
1113       : "=f" (ret)
1114       : "f" (__m1), "f" (__m2)
1115      );
1116 
1117   return ret;
1118 }
1119 
1120 /* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
1121    which preserves the data. */
1122 
1123 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f64(__m64 __m1,__m64 __m2)1124 _mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
1125 {
1126   __m64 ret;
1127 
1128   asm("punpcklbh %0, %1, %2\n\t"
1129       : "=f" (ret)
1130       : "f" (__m1), "f" (__m2)
1131      );
1132 
1133   return ret;
1134 }
1135 
/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32
   datatype, which allows load8888 to use 32-bit loads. */
1138 
1139 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi8_f(__m32 __m1,__m64 __m2)1140 _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
1141 {
1142   __m64 ret;
1143 
1144   asm("punpcklbh %0, %1, %2\n\t"
1145       : "=f" (ret)
1146       : "f" (__m1), "f" (__m2)
1147      );
1148 
1149   return ret;
1150 }
1151 
1152 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16(__m64 __m1,__m64 __m2)1153 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
1154 {
1155   __m64 ret;
1156 
1157   asm("punpcklhw %0, %1, %2\n\t"
1158       : "=f" (ret)
1159       : "f" (__m1), "f" (__m2)
1160      );
1161 
1162   return ret;
1163 }
1164 
1165 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi16_f(__m64 __m1,__m64 __m2)1166 _mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
1167 {
1168   __m64 ret;
1169 
1170   asm("punpcklhw %0, %1, %2\n\t"
1171       : "=f" (ret)
1172       : "f" (__m1), "f" (__m2)
1173      );
1174 
1175   return ret;
1176 }
1177 
1178 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32(__m64 __m1,__m64 __m2)1179 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
1180 {
1181   __m64 ret;
1182 
1183   asm("punpcklwd %0, %1, %2\n\t"
1184       : "=f" (ret)
1185       : "f" (__m1), "f" (__m2)
1186      );
1187 
1188   return ret;
1189 }
1190 
1191 
1192 extern __inline __m64 FUNCTION_ATTRIBS
_mm_unpacklo_pi32_f(__m64 __m1,__m64 __m2)1193 _mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
1194 {
1195   __m64 ret;
1196 
1197   asm("punpcklwd %0, %1, %2\n\t"
1198       : "=f" (ret)
1199       : "f" (__m1), "f" (__m2)
1200      );
1201 
1202   return ret;
1203 }
1204 
1205 extern __inline void FUNCTION_ATTRIBS
_mm_store_pi32(__m32 * dest,__m64 src)1206 _mm_store_pi32(__m32 *dest, __m64 src)
1207 {
1208   src = _mm_packs_pu16(src, _mm_setzero_si64());
1209 
1210   asm("swc1 %1, %0\n\t"
1211       : "=m" (*dest)
1212       : "f" (src)
1213       : "memory"
1214      );
1215 }
1216 
1217 extern __inline void FUNCTION_ATTRIBS
_mm_store_si64(__m64 * dest,__m64 src)1218 _mm_store_si64(__m64 *dest, __m64 src)
1219 {
1220   asm("gssdlc1 %1, 7+%0\n\t"
1221       "gssdrc1 %1, %0\n\t"
1222       : "=m" (*dest)
1223       : "f" (src)
1224       : "memory"
1225      );
1226 }
1227 
1228 extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si32(const __m32 * src)1229 _mm_load_si32(const __m32 *src)
1230 {
1231   __m32 ret;
1232 
1233   asm("lwc1 %0, %1\n\t"
1234       : "=f" (ret)
1235       : "m" (*src)
1236      );
1237 
1238   return ret;
1239 }
1240 
1241 extern __inline __m64 FUNCTION_ATTRIBS
_mm_load_si64(const __m64 * src)1242 _mm_load_si64(const __m64 *src)
1243 {
1244   __m64 ret;
1245 
1246   asm("ldc1 %0, %1\n\t"
1247       : "=f" (ret)
1248       : "m" (*src)
1249       : "memory"
1250      );
1251 
1252   return ret;
1253 }
1254 
1255 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadu_si64(const __m64 * src)1256 _mm_loadu_si64(const __m64 *src)
1257 {
1258   __m64 ret;
1259 
1260   asm("gsldlc1 %0,  7(%1)\n\t"
1261       "gsldrc1 %0,  0(%1)\n\t"
1262       : "=f" (ret)
1263       : "r" (src)
1264       : "memory"
1265      );
1266 
1267   return ret;
1268 }
1269 
1270 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8(const uint32_t * src)1271 _mm_loadlo_pi8(const uint32_t *src)
1272 {
1273   return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
1274 }
1275 
1276 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi8_f(__m64 src)1277 _mm_loadlo_pi8_f(__m64 src)
1278 {
1279   return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
1280 }
1281 
1282 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi8_f(__m64 src)1283 _mm_loadhi_pi8_f(__m64 src)
1284 {
1285   return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
1286 }
1287 
1288 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16(__m64 src)1289 _mm_loadlo_pi16(__m64 src)
1290 {
1291   return _mm_unpacklo_pi16(src, _mm_setzero_si64());
1292 }
1293 
1294 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadlo_pi16_f(__m64 src)1295 _mm_loadlo_pi16_f(__m64 src)
1296 {
1297   return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
1298 }
1299 
1300 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16(__m64 src)1301 _mm_loadhi_pi16(__m64 src)
1302 {
1303   return _mm_unpackhi_pi16(src, _mm_setzero_si64());
1304 }
1305 
1306 extern __inline __m64 FUNCTION_ATTRIBS
_mm_loadhi_pi16_f(__m64 src)1307 _mm_loadhi_pi16_f(__m64 src)
1308 {
1309   return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
1310 }
1311 
1312 extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha(__m64 pixel)1313 _mm_expand_alpha(__m64 pixel)
1314 {
1315   return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
1316 }
1317 
1318 extern __inline __m64 FUNCTION_ATTRIBS
_mm_expand_alpha_rev(__m64 pixel)1319 _mm_expand_alpha_rev(__m64 pixel)
1320 {
1321   return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
1322 }
1323 
1324 #endif  /* __LOONGSON_MMINTRIN_H__ */
1325