/*
 * Loongson MMI optimizations for libjpeg-turbo
 *
 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 * Copyright (C) 2014-2015, 2019, D. R. Commander. All Rights Reserved.
 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
 *                          All Rights Reserved.
 * Authors: ZhuChen     <zhuchen@loongson.cn>
 *          SunZhangzhi <sunzhangzhi-cq@loongson.cn>
 *          CaiWanwei   <caiwanwei@loongson.cn>
 *          ZhangLixia  <zhanglixia-hf@loongson.cn>
 *
 * Based on the x86 SIMD extension for IJG JPEG library
 * Copyright (C) 1999-2006, MIYASAKA Masaru.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jccolor-mmi.c */
34
35
36 #if RGB_RED == 0
37 #define mmA mm0
38 #define mmB mm1
39 #elif RGB_GREEN == 0
40 #define mmA mm2
41 #define mmB mm3
42 #elif RGB_BLUE == 0
43 #define mmA mm4
44 #define mmB mm5
45 #else
46 #define mmA mm6
47 #define mmB mm7
48 #endif
49
50 #if RGB_RED == 1
51 #define mmC mm0
52 #define mmD mm1
53 #elif RGB_GREEN == 1
54 #define mmC mm2
55 #define mmD mm3
56 #elif RGB_BLUE == 1
57 #define mmC mm4
58 #define mmD mm5
59 #else
60 #define mmC mm6
61 #define mmD mm7
62 #endif
63
64 #if RGB_RED == 2
65 #define mmE mm0
66 #define mmF mm1
67 #elif RGB_GREEN == 2
68 #define mmE mm2
69 #define mmF mm3
70 #elif RGB_BLUE == 2
71 #define mmE mm4
72 #define mmF mm5
73 #else
74 #define mmE mm6
75 #define mmF mm7
76 #endif
77
78 #if RGB_RED == 3
79 #define mmG mm0
80 #define mmH mm1
81 #elif RGB_GREEN == 3
82 #define mmG mm2
83 #define mmH mm3
84 #elif RGB_BLUE == 3
85 #define mmG mm4
86 #define mmH mm5
87 #else
88 #define mmG mm6
89 #define mmH mm7
90 #endif
91
92
jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width,JSAMPARRAY input_buf,JSAMPIMAGE output_buf,JDIMENSION output_row,int num_rows)93 void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
94 JSAMPIMAGE output_buf, JDIMENSION output_row,
95 int num_rows)
96 {
97 JSAMPROW inptr, outptr0, outptr1, outptr2;
98 int num_cols, col;
99 __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
100 __m64 wk[7];
101 __m64 Y_BG, Cb_RG, Cr_BG;
102
103 while (--num_rows >= 0) {
104 inptr = *input_buf++;
105 outptr0 = output_buf[0][output_row];
106 outptr1 = output_buf[1][output_row];
107 outptr2 = output_buf[2][output_row];
108 output_row++;
109
110 for (num_cols = image_width; num_cols > 0; num_cols -= 8,
111 outptr0 += 8, outptr1 += 8, outptr2 += 8) {
112
113 #if RGB_PIXELSIZE == 3
114
115 if (num_cols < 8) {
116 col = num_cols * 3;
117 asm(".set noreorder\r\n"
118
119 "li $8, 1\r\n"
120 "move $9, %3\r\n"
121 "and $10, $9, $8\r\n"
122 "beqz $10, 1f\r\n"
123 "nop \r\n"
124 "subu $9, $9, 1\r\n"
125 "xor $12, $12, $12\r\n"
126 "move $13, %5\r\n"
127 "dadd $13, $13, $9\r\n"
128 "lbu $12, 0($13)\r\n"
129
130 "1: \r\n"
131 "li $8, 2\r\n"
132 "and $10, $9, $8\r\n"
133 "beqz $10, 2f\r\n"
134 "nop \r\n"
135 "subu $9, $9, 2\r\n"
136 "xor $11, $11, $11\r\n"
137 "move $13, %5\r\n"
138 "dadd $13, $13, $9\r\n"
139 "lhu $11, 0($13)\r\n"
140 "sll $12, $12, 16\r\n"
141 "or $12, $12, $11\r\n"
142
143 "2: \r\n"
144 "dmtc1 $12, %0\r\n"
145 "li $8, 4\r\n"
146 "and $10, $9, $8\r\n"
147 "beqz $10, 3f\r\n"
148 "nop \r\n"
149 "subu $9, $9, 4\r\n"
150 "move $13, %5\r\n"
151 "dadd $13, $13, $9\r\n"
152 "lwu $14, 0($13)\r\n"
153 "dmtc1 $14, %1\r\n"
154 "dsll32 $12, $12, 0\r\n"
155 "or $12, $12, $14\r\n"
156 "dmtc1 $12, %0\r\n"
157
158 "3: \r\n"
159 "li $8, 8\r\n"
160 "and $10, $9, $8\r\n"
161 "beqz $10, 4f\r\n"
162 "nop \r\n"
163 "mov.s %1, %0\r\n"
164 "ldc1 %0, 0(%5)\r\n"
165 "li $9, 8\r\n"
166 "j 5f\r\n"
167 "nop \r\n"
168
169 "4: \r\n"
170 "li $8, 16\r\n"
171 "and $10, $9, $8\r\n"
172 "beqz $10, 5f\r\n"
173 "nop \r\n"
174 "mov.s %2, %0\r\n"
175 "ldc1 %0, 0(%5)\r\n"
176 "ldc1 %1, 8(%5)\r\n"
177
178 "5: \r\n"
179 "nop \r\n"
180 ".set reorder\r\n"
181
182 : "=f" (mmA), "=f" (mmG), "=f" (mmF)
183 : "r" (col), "r" (num_rows), "r" (inptr)
184 : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
185 "$14", "memory"
186 );
187 } else {
188 if (!(((long)inptr) & 7)) {
189 mmA = _mm_load_si64((__m64 *)&inptr[0]);
190 mmG = _mm_load_si64((__m64 *)&inptr[8]);
191 mmF = _mm_load_si64((__m64 *)&inptr[16]);
192 } else {
193 mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
194 mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
195 mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
196 }
197 inptr += RGB_PIXELSIZE * 8;
198 }
199 mmD = mmA;
200 mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
201 mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
202
203 mmA = _mm_unpackhi_pi8(mmA, mmG);
204 mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
205
206 mmD = _mm_unpacklo_pi8(mmD, mmF);
207 mmG = _mm_unpackhi_pi8(mmG, mmF);
208
209 mmE = mmA;
210 mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
211 mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
212
213 mmA = _mm_unpackhi_pi8(mmA, mmD);
214 mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
215
216 mmE = _mm_unpacklo_pi8(mmE, mmG);
217 mmD = _mm_unpackhi_pi8(mmD, mmG);
218 mmC = mmA;
219 mmA = _mm_loadlo_pi8_f(mmA);
220 mmC = _mm_loadhi_pi8_f(mmC);
221
222 mmB = mmE;
223 mmE = _mm_loadlo_pi8_f(mmE);
224 mmB = _mm_loadhi_pi8_f(mmB);
225
226 mmF = mmD;
227 mmD = _mm_loadlo_pi8_f(mmD);
228 mmF = _mm_loadhi_pi8_f(mmF);
229
230 #else /* RGB_PIXELSIZE == 4 */
231
232 if (num_cols < 8) {
233 col = num_cols;
234 asm(".set noreorder\r\n"
235
236 "li $8, 1\r\n"
237 "move $9, %4\r\n"
238 "and $10, $9, $8\r\n"
239 "beqz $10, 1f\r\n"
240 "nop \r\n"
241 "subu $9, $9, 1\r\n"
242 "dsll $11, $9, 2\r\n"
243 "move $13, %5\r\n"
244 "daddu $13, $13, $11\r\n"
245 "lwc1 %0, 0($13)\r\n"
246
247 "1: \r\n"
248 "li $8, 2\r\n"
249 "and $10, $9, $8\r\n"
250 "beqz $10, 2f\r\n"
251 "nop \r\n"
252 "subu $9, $9, 2\r\n"
253 "dsll $11, $9, 2\r\n"
254 "move $13, %5\r\n"
255 "daddu $13, $13, $11\r\n"
256 "mov.s %1, %0\r\n"
257 "ldc1 %0, 0($13)\r\n"
258
259 "2: \r\n"
260 "li $8, 4\r\n"
261 "and $10, $9, $8\r\n"
262 "beqz $10, 3f\r\n"
263 "nop \r\n"
264 "mov.s %2, %0\r\n"
265 "mov.s %3, %1\r\n"
266 "ldc1 %0, 0(%5)\r\n"
267 "ldc1 %1, 8(%5)\r\n"
268
269 "3: \r\n"
270 "nop \r\n"
271 ".set reorder\r\n"
272
273 : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
274 : "r" (col), "r" (inptr)
275 : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
276 );
277 } else {
278 if (!(((long)inptr) & 7)) {
279 mmA = _mm_load_si64((__m64 *)&inptr[0]);
280 mmF = _mm_load_si64((__m64 *)&inptr[8]);
281 mmD = _mm_load_si64((__m64 *)&inptr[16]);
282 mmC = _mm_load_si64((__m64 *)&inptr[24]);
283 } else {
284 mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
285 mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
286 mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
287 mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
288 }
289 inptr += RGB_PIXELSIZE * 8;
290 }
291 mmB = mmA;
292 mmA = _mm_unpacklo_pi8(mmA, mmF);
293 mmB = _mm_unpackhi_pi8(mmB, mmF);
294
295 mmG = mmD;
296 mmD = _mm_unpacklo_pi8(mmD, mmC);
297 mmG = _mm_unpackhi_pi8(mmG, mmC);
298
299 mmE = mmA;
300 mmA = _mm_unpacklo_pi16(mmA, mmD);
301 mmE = _mm_unpackhi_pi16(mmE, mmD);
302
303 mmH = mmB;
304 mmB = _mm_unpacklo_pi16(mmB, mmG);
305 mmH = _mm_unpackhi_pi16(mmH, mmG);
306
307 mmC = mmA;
308 mmA = _mm_loadlo_pi8_f(mmA);
309 mmC = _mm_loadhi_pi8_f(mmC);
310
311 mmD = mmB;
312 mmB = _mm_loadlo_pi8_f(mmB);
313 mmD = _mm_loadhi_pi8_f(mmD);
314
315 mmG = mmE;
316 mmE = _mm_loadlo_pi8_f(mmE);
317 mmG = _mm_loadhi_pi8_f(mmG);
318
319 mmF = mmH;
320 mmF = _mm_unpacklo_pi8(mmF, mmH);
321 mmH = _mm_unpackhi_pi8(mmH, mmH);
322 mmF = _mm_srli_pi16(mmF, BYTE_BIT);
323 mmH = _mm_srli_pi16(mmH, BYTE_BIT);
324
325 #endif
326
327 wk[0] = mm0;
328 wk[1] = mm1;
329 wk[2] = mm4;
330 wk[3] = mm5;
331
332 mm6 = mm1;
333 mm1 = _mm_unpacklo_pi16(mm1, mm3);
334 mm6 = _mm_unpackhi_pi16(mm6, mm3);
335 mm7 = mm1;
336 mm4 = mm6;
337 mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
338 mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
339 mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
340 mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
341
342 wk[4] = mm1;
343 wk[5] = mm6;
344
345 mm1 = _mm_loadlo_pi16_f(mm5);
346 mm6 = _mm_loadhi_pi16_f(mm5);
347 mm1 = _mm_srli_pi32(mm1, 1);
348 mm6 = _mm_srli_pi32(mm6, 1);
349
350 mm5 = PD_ONEHALFM1_CJ;
351 mm7 = _mm_add_pi32(mm7, mm1);
352 mm4 = _mm_add_pi32(mm4, mm6);
353 mm7 = _mm_add_pi32(mm7, mm5);
354 mm4 = _mm_add_pi32(mm4, mm5);
355 mm7 = _mm_srli_pi32(mm7, SCALEBITS);
356 mm4 = _mm_srli_pi32(mm4, SCALEBITS);
357 mm7 = _mm_packs_pi32(mm7, mm4);
358
359 mm1 = wk[2];
360 mm6 = mm0;
361 mm0 = _mm_unpacklo_pi16(mm0, mm2);
362 mm6 = _mm_unpackhi_pi16(mm6, mm2);
363 mm5 = mm0;
364 mm4 = mm6;
365 mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
366 mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
367 mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
368 mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
369
370 wk[6] = mm0;
371 wk[7] = mm6;
372 mm0 = _mm_loadlo_pi16_f(mm1);
373 mm6 = _mm_loadhi_pi16_f(mm1);
374 mm0 = _mm_srli_pi32(mm0, 1);
375 mm6 = _mm_srli_pi32(mm6, 1);
376
377 mm1 = PD_ONEHALFM1_CJ;
378 mm5 = _mm_add_pi32(mm5, mm0);
379 mm4 = _mm_add_pi32(mm4, mm6);
380 mm5 = _mm_add_pi32(mm5, mm1);
381 mm4 = _mm_add_pi32(mm4, mm1);
382 mm5 = _mm_srli_pi32(mm5, SCALEBITS);
383 mm4 = _mm_srli_pi32(mm4, SCALEBITS);
384 mm5 = _mm_packs_pi32(mm5, mm4);
385
386 mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
387 mm5 = _mm_or_si64(mm5, mm7);
388 Cb_RG = mm5;
389
390 mm0 = wk[3];
391 mm6 = wk[2];
392 mm1 = wk[1];
393
394 mm4 = mm0;
395 mm0 = _mm_unpacklo_pi16(mm0, mm3);
396 mm4 = _mm_unpackhi_pi16(mm4, mm3);
397 mm7 = mm0;
398 mm5 = mm4;
399 mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
400 mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
401 mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
402 mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
403
404 mm3 = PD_ONEHALF;
405 mm0 = _mm_add_pi32(mm0, wk[4]);
406 mm4 = _mm_add_pi32(mm4, wk[5]);
407 mm0 = _mm_add_pi32(mm0, mm3);
408 mm4 = _mm_add_pi32(mm4, mm3);
409 mm0 = _mm_srli_pi32(mm0, SCALEBITS);
410 mm4 = _mm_srli_pi32(mm4, SCALEBITS);
411 mm0 = _mm_packs_pi32(mm0, mm4);
412
413 mm3 = _mm_loadlo_pi16_f(mm1);
414 mm4 = _mm_loadhi_pi16_f(mm1);
415 mm3 = _mm_srli_pi32(mm3, 1);
416 mm4 = _mm_srli_pi32(mm4, 1);
417
418 mm1 = PD_ONEHALFM1_CJ;
419 mm7 = _mm_add_pi32(mm7, mm3);
420 mm5 = _mm_add_pi32(mm5, mm4);
421 mm7 = _mm_add_pi32(mm7, mm1);
422 mm5 = _mm_add_pi32(mm5, mm1);
423 mm7 = _mm_srli_pi32(mm7, SCALEBITS);
424 mm5 = _mm_srli_pi32(mm5, SCALEBITS);
425 mm7 = _mm_packs_pi32(mm7, mm5);
426
427 mm3 = wk[0];
428 mm4 = mm6;
429 mm6 = _mm_unpacklo_pi16(mm6, mm2);
430 mm4 = _mm_unpackhi_pi16(mm4, mm2);
431 mm1 = mm6;
432 mm5 = mm4;
433 mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
434 mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
435 mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
436 mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
437
438 mm2 = PD_ONEHALF;
439 mm6 = _mm_add_pi32(mm6, wk[6]);
440 mm4 = _mm_add_pi32(mm4, wk[7]);
441 mm6 = _mm_add_pi32(mm6, mm2);
442 mm4 = _mm_add_pi32(mm4, mm2);
443 mm6 = _mm_srli_pi32(mm6, SCALEBITS);
444 mm4 = _mm_srli_pi32(mm4, SCALEBITS);
445 mm6 = _mm_packs_pi32(mm6, mm4);
446
447 mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
448 mm6 = _mm_or_si64(mm6, mm0);
449 Y_BG = mm6;
450
451 mm2 = _mm_loadlo_pi16_f(mm3);
452 mm4 = _mm_loadhi_pi16_f(mm3);
453 mm2 = _mm_srli_pi32(mm2, 1);
454 mm4 = _mm_srli_pi32(mm4, 1);
455
456 mm0 = PD_ONEHALFM1_CJ;
457 mm1 = _mm_add_pi32(mm1, mm2);
458 mm5 = _mm_add_pi32(mm5, mm4);
459 mm1 = _mm_add_pi32(mm1, mm0);
460 mm5 = _mm_add_pi32(mm5, mm0);
461 mm1 = _mm_srli_pi32(mm1, SCALEBITS);
462 mm5 = _mm_srli_pi32(mm5, SCALEBITS);
463 mm1 = _mm_packs_pi32(mm1, mm5);
464
465 mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
466 mm1 = _mm_or_si64(mm1, mm7);
467 Cr_BG = mm1;
468
469 _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
470 _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
471 _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
472 }
473 }
474 }
475
476 #undef mmA
477 #undef mmB
478 #undef mmC
479 #undef mmD
480 #undef mmE
481 #undef mmF
482 #undef mmG
483 #undef mmH
484