/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors:  ZhuChen     <zhuchen@loongson.cn>
 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *           CaiWanwei   <caiwanwei@loongson.cn>
 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-mmi.c */
34 
35 
/*
 * Map the generic register aliases mmA-mmH onto the physical MMI registers
 * mm0-mm7 according to the byte position of each color component within a
 * pixel (RGB_RED / RGB_GREEN / RGB_BLUE are defined by the including file,
 * jccolor-mmi.c, once per supported RGB byte order).  The pair (mmA, mmB)
 * names the registers that end up holding the component stored at byte
 * offset 0, (mmC, mmD) offset 1, (mmE, mmF) offset 2, and (mmG, mmH)
 * offset 3 (alpha/filler for 4-byte pixels).  This lets the conversion code
 * below be written once, independent of the RGB component ordering.
 */
#if RGB_RED == 0
#define mmA  mm0
#define mmB  mm1
#elif RGB_GREEN == 0
#define mmA  mm2
#define mmB  mm3
#elif RGB_BLUE == 0
#define mmA  mm4
#define mmB  mm5
#else
#define mmA  mm6
#define mmB  mm7
#endif

#if RGB_RED == 1
#define mmC  mm0
#define mmD  mm1
#elif RGB_GREEN == 1
#define mmC  mm2
#define mmD  mm3
#elif RGB_BLUE == 1
#define mmC  mm4
#define mmD  mm5
#else
#define mmC  mm6
#define mmD  mm7
#endif

#if RGB_RED == 2
#define mmE  mm0
#define mmF  mm1
#elif RGB_GREEN == 2
#define mmE  mm2
#define mmF  mm3
#elif RGB_BLUE == 2
#define mmE  mm4
#define mmF  mm5
#else
#define mmE  mm6
#define mmF  mm7
#endif

#if RGB_RED == 3
#define mmG  mm0
#define mmH  mm1
#elif RGB_GREEN == 3
#define mmG  mm2
#define mmH  mm3
#elif RGB_BLUE == 3
#define mmG  mm4
#define mmH  mm5
#else
#define mmG  mm6
#define mmH  mm7
#endif
91 
92 
jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width,JSAMPARRAY input_buf,JSAMPIMAGE output_buf,JDIMENSION output_row,int num_rows)93 void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
94                                JSAMPIMAGE output_buf, JDIMENSION output_row,
95                                int num_rows)
96 {
97   JSAMPROW inptr, outptr0, outptr1, outptr2;
98   int num_cols, col;
99   __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
100   __m64 wk[7];
101   __m64 Y_BG, Cb_RG, Cr_BG;
102 
103   while (--num_rows >= 0) {
104     inptr = *input_buf++;
105     outptr0 = output_buf[0][output_row];
106     outptr1 = output_buf[1][output_row];
107     outptr2 = output_buf[2][output_row];
108     output_row++;
109 
110     for (num_cols = image_width; num_cols > 0; num_cols -= 8,
111          outptr0 += 8, outptr1 += 8, outptr2 += 8) {
112 
113 #if RGB_PIXELSIZE == 3
114 
115       if (num_cols < 8) {
116         col = num_cols * 3;
117         asm(".set noreorder\r\n"
118 
119             "li     $8, 1\r\n"
120             "move   $9, %3\r\n"
121             "and    $10, $9, $8\r\n"
122             "beqz   $10, 1f\r\n"
123             "nop    \r\n"
124             "subu   $9, $9, 1\r\n"
125             "xor    $12, $12, $12\r\n"
126             "move   $13, %5\r\n"
127             "dadd   $13, $13, $9\r\n"
128             "lbu    $12, 0($13)\r\n"
129 
130             "1:     \r\n"
131             "li     $8, 2\r\n"
132             "and    $10, $9, $8\r\n"
133             "beqz   $10, 2f\r\n"
134             "nop    \r\n"
135             "subu   $9, $9, 2\r\n"
136             "xor    $11, $11, $11\r\n"
137             "move   $13, %5\r\n"
138             "dadd   $13, $13, $9\r\n"
139             "lhu    $11, 0($13)\r\n"
140             "sll    $12, $12, 16\r\n"
141             "or     $12, $12, $11\r\n"
142 
143             "2:     \r\n"
144             "dmtc1  $12, %0\r\n"
145             "li     $8, 4\r\n"
146             "and    $10, $9, $8\r\n"
147             "beqz   $10, 3f\r\n"
148             "nop    \r\n"
149             "subu   $9, $9, 4\r\n"
150             "move   $13, %5\r\n"
151             "dadd   $13, $13, $9\r\n"
152             "lwu    $14, 0($13)\r\n"
153             "dmtc1  $14, %1\r\n"
154             "dsll32 $12, $12, 0\r\n"
155             "or     $12, $12, $14\r\n"
156             "dmtc1  $12, %0\r\n"
157 
158             "3:     \r\n"
159             "li     $8, 8\r\n"
160             "and    $10, $9, $8\r\n"
161             "beqz   $10, 4f\r\n"
162             "nop    \r\n"
163             "mov.s  %1, %0\r\n"
164             "ldc1   %0, 0(%5)\r\n"
165             "li     $9, 8\r\n"
166             "j      5f\r\n"
167             "nop    \r\n"
168 
169             "4:     \r\n"
170             "li     $8, 16\r\n"
171             "and    $10, $9, $8\r\n"
172             "beqz   $10, 5f\r\n"
173             "nop    \r\n"
174             "mov.s  %2, %0\r\n"
175             "ldc1   %0, 0(%5)\r\n"
176             "ldc1   %1, 8(%5)\r\n"
177 
178             "5:     \r\n"
179             "nop    \r\n"
180             ".set reorder\r\n"
181 
182             : "=f" (mmA), "=f" (mmG), "=f" (mmF)
183             : "r" (col), "r" (num_rows), "r" (inptr)
184             : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
185               "$14", "memory"
186            );
187       } else {
188         if (!(((long)inptr) & 7)) {
189           mmA = _mm_load_si64((__m64 *)&inptr[0]);
190           mmG = _mm_load_si64((__m64 *)&inptr[8]);
191           mmF = _mm_load_si64((__m64 *)&inptr[16]);
192         } else {
193           mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
194           mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
195           mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
196         }
197         inptr += RGB_PIXELSIZE * 8;
198       }
199       mmD = mmA;
200       mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
201       mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
202 
203       mmA = _mm_unpackhi_pi8(mmA, mmG);
204       mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
205 
206       mmD = _mm_unpacklo_pi8(mmD, mmF);
207       mmG = _mm_unpackhi_pi8(mmG, mmF);
208 
209       mmE = mmA;
210       mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
211       mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
212 
213       mmA = _mm_unpackhi_pi8(mmA, mmD);
214       mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
215 
216       mmE = _mm_unpacklo_pi8(mmE, mmG);
217       mmD = _mm_unpackhi_pi8(mmD, mmG);
218       mmC = mmA;
219       mmA = _mm_loadlo_pi8_f(mmA);
220       mmC = _mm_loadhi_pi8_f(mmC);
221 
222       mmB = mmE;
223       mmE = _mm_loadlo_pi8_f(mmE);
224       mmB = _mm_loadhi_pi8_f(mmB);
225 
226       mmF = mmD;
227       mmD = _mm_loadlo_pi8_f(mmD);
228       mmF = _mm_loadhi_pi8_f(mmF);
229 
230 #else  /* RGB_PIXELSIZE == 4 */
231 
232       if (num_cols < 8) {
233         col = num_cols;
234         asm(".set noreorder\r\n"
235 
236             "li     $8, 1\r\n"
237             "move   $9, %4\r\n"
238             "and    $10, $9, $8\r\n"
239             "beqz   $10, 1f\r\n"
240             "nop    \r\n"
241             "subu   $9, $9, 1\r\n"
242             "dsll   $11, $9, 2\r\n"
243             "move   $13, %5\r\n"
244             "daddu  $13, $13, $11\r\n"
245             "lwc1   %0, 0($13)\r\n"
246 
247             "1:     \r\n"
248             "li     $8, 2\r\n"
249             "and    $10, $9, $8\r\n"
250             "beqz   $10, 2f\r\n"
251             "nop    \r\n"
252             "subu   $9, $9, 2\r\n"
253             "dsll   $11, $9, 2\r\n"
254             "move   $13, %5\r\n"
255             "daddu  $13, $13, $11\r\n"
256             "mov.s  %1, %0\r\n"
257             "ldc1   %0, 0($13)\r\n"
258 
259             "2:     \r\n"
260             "li     $8, 4\r\n"
261             "and    $10, $9, $8\r\n"
262             "beqz   $10, 3f\r\n"
263             "nop    \r\n"
264             "mov.s  %2, %0\r\n"
265             "mov.s  %3, %1\r\n"
266             "ldc1   %0, 0(%5)\r\n"
267             "ldc1   %1, 8(%5)\r\n"
268 
269             "3:     \r\n"
270             "nop    \r\n"
271             ".set reorder\r\n"
272 
273             : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
274             : "r" (col), "r" (inptr)
275             : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
276            );
277       } else {
278         if (!(((long)inptr) & 7)) {
279           mmA = _mm_load_si64((__m64 *)&inptr[0]);
280           mmF = _mm_load_si64((__m64 *)&inptr[8]);
281           mmD = _mm_load_si64((__m64 *)&inptr[16]);
282           mmC = _mm_load_si64((__m64 *)&inptr[24]);
283         } else {
284           mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
285           mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
286           mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
287           mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
288         }
289         inptr += RGB_PIXELSIZE * 8;
290       }
291       mmB = mmA;
292       mmA = _mm_unpacklo_pi8(mmA, mmF);
293       mmB = _mm_unpackhi_pi8(mmB, mmF);
294 
295       mmG = mmD;
296       mmD = _mm_unpacklo_pi8(mmD, mmC);
297       mmG = _mm_unpackhi_pi8(mmG, mmC);
298 
299       mmE = mmA;
300       mmA = _mm_unpacklo_pi16(mmA, mmD);
301       mmE = _mm_unpackhi_pi16(mmE, mmD);
302 
303       mmH = mmB;
304       mmB = _mm_unpacklo_pi16(mmB, mmG);
305       mmH = _mm_unpackhi_pi16(mmH, mmG);
306 
307       mmC = mmA;
308       mmA = _mm_loadlo_pi8_f(mmA);
309       mmC = _mm_loadhi_pi8_f(mmC);
310 
311       mmD = mmB;
312       mmB = _mm_loadlo_pi8_f(mmB);
313       mmD = _mm_loadhi_pi8_f(mmD);
314 
315       mmG = mmE;
316       mmE = _mm_loadlo_pi8_f(mmE);
317       mmG = _mm_loadhi_pi8_f(mmG);
318 
319       mmF = mmH;
320       mmF = _mm_unpacklo_pi8(mmF, mmH);
321       mmH = _mm_unpackhi_pi8(mmH, mmH);
322       mmF = _mm_srli_pi16(mmF, BYTE_BIT);
323       mmH = _mm_srli_pi16(mmH, BYTE_BIT);
324 
325 #endif
326 
327       wk[0] = mm0;
328       wk[1] = mm1;
329       wk[2] = mm4;
330       wk[3] = mm5;
331 
332       mm6 = mm1;
333       mm1 = _mm_unpacklo_pi16(mm1, mm3);
334       mm6 = _mm_unpackhi_pi16(mm6, mm3);
335       mm7 = mm1;
336       mm4 = mm6;
337       mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
338       mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
339       mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
340       mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
341 
342       wk[4] = mm1;
343       wk[5] = mm6;
344 
345       mm1 = _mm_loadlo_pi16_f(mm5);
346       mm6 = _mm_loadhi_pi16_f(mm5);
347       mm1 = _mm_srli_pi32(mm1, 1);
348       mm6 = _mm_srli_pi32(mm6, 1);
349 
350       mm5 = PD_ONEHALFM1_CJ;
351       mm7 = _mm_add_pi32(mm7, mm1);
352       mm4 = _mm_add_pi32(mm4, mm6);
353       mm7 = _mm_add_pi32(mm7, mm5);
354       mm4 = _mm_add_pi32(mm4, mm5);
355       mm7 = _mm_srli_pi32(mm7, SCALEBITS);
356       mm4 = _mm_srli_pi32(mm4, SCALEBITS);
357       mm7 = _mm_packs_pi32(mm7, mm4);
358 
359       mm1 = wk[2];
360       mm6 = mm0;
361       mm0 = _mm_unpacklo_pi16(mm0, mm2);
362       mm6 = _mm_unpackhi_pi16(mm6, mm2);
363       mm5 = mm0;
364       mm4 = mm6;
365       mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
366       mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
367       mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
368       mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
369 
370       wk[6] = mm0;
371       wk[7] = mm6;
372       mm0 = _mm_loadlo_pi16_f(mm1);
373       mm6 = _mm_loadhi_pi16_f(mm1);
374       mm0 = _mm_srli_pi32(mm0, 1);
375       mm6 = _mm_srli_pi32(mm6, 1);
376 
377       mm1 = PD_ONEHALFM1_CJ;
378       mm5 = _mm_add_pi32(mm5, mm0);
379       mm4 = _mm_add_pi32(mm4, mm6);
380       mm5 = _mm_add_pi32(mm5, mm1);
381       mm4 = _mm_add_pi32(mm4, mm1);
382       mm5 = _mm_srli_pi32(mm5, SCALEBITS);
383       mm4 = _mm_srli_pi32(mm4, SCALEBITS);
384       mm5 = _mm_packs_pi32(mm5, mm4);
385 
386       mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
387       mm5  = _mm_or_si64(mm5, mm7);
388       Cb_RG = mm5;
389 
390       mm0 = wk[3];
391       mm6 = wk[2];
392       mm1 = wk[1];
393 
394       mm4 = mm0;
395       mm0 = _mm_unpacklo_pi16(mm0, mm3);
396       mm4 = _mm_unpackhi_pi16(mm4, mm3);
397       mm7 = mm0;
398       mm5 = mm4;
399       mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
400       mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
401       mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
402       mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
403 
404       mm3 = PD_ONEHALF;
405       mm0 = _mm_add_pi32(mm0, wk[4]);
406       mm4 = _mm_add_pi32(mm4, wk[5]);
407       mm0 = _mm_add_pi32(mm0, mm3);
408       mm4 = _mm_add_pi32(mm4, mm3);
409       mm0 = _mm_srli_pi32(mm0, SCALEBITS);
410       mm4 = _mm_srli_pi32(mm4, SCALEBITS);
411       mm0 = _mm_packs_pi32(mm0, mm4);
412 
413       mm3 = _mm_loadlo_pi16_f(mm1);
414       mm4 = _mm_loadhi_pi16_f(mm1);
415       mm3 = _mm_srli_pi32(mm3, 1);
416       mm4 = _mm_srli_pi32(mm4, 1);
417 
418       mm1 = PD_ONEHALFM1_CJ;
419       mm7 = _mm_add_pi32(mm7, mm3);
420       mm5 = _mm_add_pi32(mm5, mm4);
421       mm7 = _mm_add_pi32(mm7, mm1);
422       mm5 = _mm_add_pi32(mm5, mm1);
423       mm7 = _mm_srli_pi32(mm7, SCALEBITS);
424       mm5 = _mm_srli_pi32(mm5, SCALEBITS);
425       mm7 = _mm_packs_pi32(mm7, mm5);
426 
427       mm3 = wk[0];
428       mm4 = mm6;
429       mm6 = _mm_unpacklo_pi16(mm6, mm2);
430       mm4 = _mm_unpackhi_pi16(mm4, mm2);
431       mm1 = mm6;
432       mm5 = mm4;
433       mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
434       mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
435       mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
436       mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
437 
438       mm2 = PD_ONEHALF;
439       mm6 = _mm_add_pi32(mm6, wk[6]);
440       mm4 = _mm_add_pi32(mm4, wk[7]);
441       mm6 = _mm_add_pi32(mm6, mm2);
442       mm4 = _mm_add_pi32(mm4, mm2);
443       mm6 = _mm_srli_pi32(mm6, SCALEBITS);
444       mm4 = _mm_srli_pi32(mm4, SCALEBITS);
445       mm6 = _mm_packs_pi32(mm6, mm4);
446 
447       mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
448       mm6 = _mm_or_si64(mm6, mm0);
449       Y_BG = mm6;
450 
451       mm2 = _mm_loadlo_pi16_f(mm3);
452       mm4 = _mm_loadhi_pi16_f(mm3);
453       mm2 = _mm_srli_pi32(mm2, 1);
454       mm4 = _mm_srli_pi32(mm4, 1);
455 
456       mm0 = PD_ONEHALFM1_CJ;
457       mm1 = _mm_add_pi32(mm1, mm2);
458       mm5 = _mm_add_pi32(mm5, mm4);
459       mm1 = _mm_add_pi32(mm1, mm0);
460       mm5 = _mm_add_pi32(mm5, mm0);
461       mm1 = _mm_srli_pi32(mm1, SCALEBITS);
462       mm5 = _mm_srli_pi32(mm5, SCALEBITS);
463       mm1 = _mm_packs_pi32(mm1, mm5);
464 
465       mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
466       mm1 = _mm_or_si64(mm1, mm7);
467       Cr_BG = mm1;
468 
469       _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
470       _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
471       _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
472     }
473   }
474 }
475 
/* Release the color-order-dependent register aliases so that this file can
   be re-included (by jccolor-mmi.c) with a different RGB byte order. */
#undef mmA
#undef mmB
#undef mmC
#undef mmD
#undef mmE
#undef mmF
#undef mmG
#undef mmH
484