1/*
2 * MIPS DSPr2 optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5 * All rights reserved.
6 * Authors:  Teodora Novkovic (teodora.novkovic@imgtec.com)
7 *           Darko Laus       (darko.laus@imgtec.com)
8 * This software is provided 'as-is', without any express or implied
9 * warranty.  In no event will the authors be held liable for any damages
10 * arising from the use of this software.
11 *
12 * Permission is granted to anyone to use this software for any purpose,
13 * including commercial applications, and to alter it and redistribute it
14 * freely, subject to the following restrictions:
15 *
16 * 1. The origin of this software must not be misrepresented; you must not
17 *    claim that you wrote the original software. If you use this software
18 *    in a product, an acknowledgment in the product documentation would be
19 *    appreciated but is not required.
20 * 2. Altered source versions must be plainly marked as such, and must not be
21 *    misrepresented as being the original software.
22 * 3. This notice may not be removed or altered from any source distribution.
23 */
24
25#include "jsimd_mips_dspr2_asm.h"
26
27/*****************************************************************************/
28LEAF_MIPS_DSPR2(jsimd_c_null_convert_mips_dspr2)
29/*
30 * a0     - cinfo->image_width
31 * a1     - input_buf
32 * a2     - output_buf
33 * a3     - output_row
34 * 16(sp) - num_rows
35 * 20(sp) - cinfo->num_components
36 *
37 * Null conversion for compression
38 */
39
40    SAVE_REGS_ON_STACK 8, s0, s1
41
42    lw        t9, 24(sp)   // t9 = num_rows
43    lw        s0, 28(sp)   // s0 = cinfo->num_components
44    andi      t0, a0, 3    // t0 = cinfo->image_width & 3
45    beqz      t0, 4f       // no residual
46     nop
470:
48    addiu     t9, t9, -1
49    bltz      t9, 7f
50     li       t1, 0
511:
52    sll       t3, t1, 2
53    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
54    lw        t2, 0(a1)    // t2 = inptr = *input_buf
55    sll       t4, a3, 2
56    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
57    addu      t2, t2, t1
58    addu      s1, t5, a0
59    addu      t6, t5, t0
602:
61    lbu       t3, 0(t2)
62    addiu     t5, t5, 1
63    sb        t3, -1(t5)
64    bne       t6, t5, 2b
65     addu     t2, t2, s0
663:
67    lbu       t3, 0(t2)
68    addu      t4, t2, s0
69    addu      t7, t4, s0
70    addu      t8, t7, s0
71    addu      t2, t8, s0
72    lbu       t4, 0(t4)
73    lbu       t7, 0(t7)
74    lbu       t8, 0(t8)
75    addiu     t5, t5, 4
76    sb        t3, -4(t5)
77    sb        t4, -3(t5)
78    sb        t7, -2(t5)
79    bne       s1, t5, 3b
80     sb       t8, -1(t5)
81    addiu     t1, t1, 1
82    bne       t1, s0, 1b
83     nop
84    addiu     a1, a1, 4
85    bgez      t9, 0b
86     addiu    a3, a3, 1
87    b         7f
88     nop
894:
90    addiu     t9, t9, -1
91    bltz      t9, 7f
92     li       t1, 0
935:
94    sll       t3, t1, 2
95    lwx       t5, t3(a2)   // t5 = outptr = output_buf[ci]
96    lw        t2, 0(a1)    // t2 = inptr = *input_buf
97    sll       t4, a3, 2
98    lwx       t5, t4(t5)   // t5 = outptr = output_buf[ci][output_row]
99    addu      t2, t2, t1
100    addu      s1, t5, a0
101    addu      t6, t5, t0
1026:
103    lbu       t3, 0(t2)
104    addu      t4, t2, s0
105    addu      t7, t4, s0
106    addu      t8, t7, s0
107    addu      t2, t8, s0
108    lbu       t4, 0(t4)
109    lbu       t7, 0(t7)
110    lbu       t8, 0(t8)
111    addiu     t5, t5, 4
112    sb        t3, -4(t5)
113    sb        t4, -3(t5)
114    sb        t7, -2(t5)
115    bne       s1, t5, 6b
116     sb       t8, -1(t5)
117    addiu     t1, t1, 1
118    bne       t1, s0, 5b
119     nop
120    addiu     a1, a1, 4
121    bgez      t9, 4b
122     addiu    a3, a3, 1
1237:
124    RESTORE_REGS_FROM_STACK 8, s0, s1
125
126    j         ra
127     nop
128
129END(jsimd_c_null_convert_mips_dspr2)
130
131/*****************************************************************************/
132/*
133 * jsimd_extrgb_ycc_convert_mips_dspr2
134 * jsimd_extbgr_ycc_convert_mips_dspr2
135 * jsimd_extrgbx_ycc_convert_mips_dspr2
136 * jsimd_extbgrx_ycc_convert_mips_dspr2
137 * jsimd_extxbgr_ycc_convert_mips_dspr2
138 * jsimd_extxrgb_ycc_convert_mips_dspr2
139 *
140 * Colorspace conversion RGB -> YCbCr
141 */
142
143.macro GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
144
145.macro DO_RGB_TO_YCC r,    \
146                     g,    \
147                     b,    \
148                     inptr
149    lbu     \r, \r_offs(\inptr)
150    lbu     \g, \g_offs(\inptr)
151    lbu     \b, \b_offs(\inptr)
152    addiu   \inptr, \pixel_size
153.endm
154
155LEAF_MIPS_DSPR2(jsimd_\colorid\()_ycc_convert_mips_dspr2)
156/*
157 * a0     - cinfo->image_width
158 * a1     - input_buf
159 * a2     - output_buf
160 * a3     - output_row
161 * 16(sp) - num_rows
162 */
163
164    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
165
166    lw      t7, 48(sp)        // t7 = num_rows
167    li      s0, 0x4c8b        // FIX(0.29900)
168    li      s1, 0x9646        // FIX(0.58700)
169    li      s2, 0x1d2f        // FIX(0.11400)
170    li      s3, 0xffffd4cd    // -FIX(0.16874)
171    li      s4, 0xffffab33    // -FIX(0.33126)
172    li      s5, 0x8000        // FIX(0.50000)
173    li      s6, 0xffff94d1    // -FIX(0.41869)
174    li      s7, 0xffffeb2f    // -FIX(0.08131)
175    li      t8, 0x807fff      // CBCR_OFFSET + ONE_HALF-1
176
1770:
178    addiu   t7, -1            // --num_rows
179    lw      t6, 0(a1)         // t6 = input_buf[0]
180    lw      t0, 0(a2)
181    lw      t1, 4(a2)
182    lw      t2, 8(a2)
183    sll     t3, a3, 2
184    lwx     t0, t3(t0)        // t0 = output_buf[0][output_row]
185    lwx     t1, t3(t1)        // t1 = output_buf[1][output_row]
186    lwx     t2, t3(t2)        // t2 = output_buf[2][output_row]
187
188    addu    t9, t2, a0        // t9 = end address
189    addiu   a3, 1
190
1911:
192    DO_RGB_TO_YCC t3, t4, t5, t6
193
194    mtlo    s5, $ac0
195    mtlo    t8, $ac1
196    mtlo    t8, $ac2
197    maddu   $ac0, s2, t5
198    maddu   $ac1, s5, t5
199    maddu   $ac2, s5, t3
200    maddu   $ac0, s0, t3
201    maddu   $ac1, s3, t3
202    maddu   $ac2, s6, t4
203    maddu   $ac0, s1, t4
204    maddu   $ac1, s4, t4
205    maddu   $ac2, s7, t5
206    extr.w  t3, $ac0, 16
207    extr.w  t4, $ac1, 16
208    extr.w  t5, $ac2, 16
209    sb      t3, 0(t0)
210    sb      t4, 0(t1)
211    sb      t5, 0(t2)
212    addiu   t0, 1
213    addiu   t2, 1
214    bne     t2, t9, 1b
215     addiu  t1, 1
216    bgtz    t7, 0b
217     addiu  a1, 4
218
219    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
220
221    j ra
222     nop
223END(jsimd_\colorid\()_ycc_convert_mips_dspr2)
224
225.purgem DO_RGB_TO_YCC
226
227.endm
228
229/*------------------------------------------id -- pix R  G  B */
230GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
231GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
232GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
233GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
234GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
235GENERATE_JSIMD_RGB_YCC_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
236
237/*****************************************************************************/
238/*
239 * jsimd_ycc_extrgb_convert_mips_dspr2
240 * jsimd_ycc_extbgr_convert_mips_dspr2
241 * jsimd_ycc_extrgbx_convert_mips_dspr2
242 * jsimd_ycc_extbgrx_convert_mips_dspr2
243 * jsimd_ycc_extxbgr_convert_mips_dspr2
244 * jsimd_ycc_extxrgb_convert_mips_dspr2
245 *
246 * Colorspace conversion YCbCr -> RGB
247 */
248
249.macro GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs, a_offs
250
251.macro STORE_YCC_TO_RGB  scratch0 \
252                         scratch1 \
253                         scratch2 \
254                         outptr
255    sb       \scratch0, \r_offs(\outptr)
256    sb       \scratch1, \g_offs(\outptr)
257    sb       \scratch2, \b_offs(\outptr)
258.if (\pixel_size == 4)
259    li       t0, 0xFF
260    sb       t0, \a_offs(\outptr)
261.endif
262    addiu    \outptr, \pixel_size
263.endm
264
265LEAF_MIPS_DSPR2(jsimd_ycc_\colorid\()_convert_mips_dspr2)
266/*
267 * a0     - cinfo->image_width
268 * a1     - input_buf
269 * a2     - input_row
270 * a3     - output_buf
271 * 16(sp) - num_rows
272 */
273
274    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
275
276    lw         s1, 48(sp)
277    li         t3, 0x8000
278    li         t4, 0x166e9     // FIX(1.40200)
279    li         t5, 0x1c5a2     // FIX(1.77200)
280    li         t6, 0xffff492e  // -FIX(0.71414)
281    li         t7, 0xffffa7e6  // -FIX(0.34414)
282    repl.ph    t8, 128
283
2840:
285    lw         s0, 0(a3)
286    lw         t0, 0(a1)
287    lw         t1, 4(a1)
288    lw         t2, 8(a1)
289    sll        s5, a2, 2
290    addiu      s1, -1
291    lwx        s2, s5(t0)
292    lwx        s3, s5(t1)
293    lwx        s4, s5(t2)
294    addu       t9, s2, a0
295    addiu      a2, 1
296
2971:
298    lbu        s7, 0(s4)       // cr
299    lbu        s6, 0(s3)       // cb
300    lbu        s5, 0(s2)       // y
301    addiu      s2, 1
302    addiu      s4, 1
303    addiu      s7, -128
304    addiu      s6, -128
305    mul        t2, t7, s6
306    mul        t0, t6, s7      // Crgtab[cr]
307    sll        s7, 15
308    mulq_rs.w  t1, t4, s7      // Crrtab[cr]
309    sll        s6, 15
310    addu       t2, t3          // Cbgtab[cb]
311    addu       t2, t0
312
313    mulq_rs.w  t0, t5, s6      // Cbbtab[cb]
314    sra        t2, 16
315    addu       t1, s5
316    addu       t2, s5          // add y
317    ins        t2, t1, 16, 16
318    subu.ph    t2, t2, t8
319    addu       t0, s5
320    shll_s.ph  t2, t2, 8
321    subu       t0, 128
322    shra.ph    t2, t2, 8
323    shll_s.w   t0, t0, 24
324    addu.ph    t2, t2, t8      // clip & store
325    sra        t0, t0, 24
326    sra        t1, t2, 16
327    addiu      t0, 128
328
329    STORE_YCC_TO_RGB t1, t2, t0, s0
330
331    bne        s2, t9, 1b
332     addiu     s3, 1
333    bgtz       s1, 0b
334     addiu     a3, 4
335
336    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
337
338    j ra
339     nop
340END(jsimd_ycc_\colorid\()_convert_mips_dspr2)
341
342.purgem STORE_YCC_TO_RGB
343
344.endm
345
346/*------------------------------------------id -- pix R  G  B  A */
347GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2, 3
348GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0, 3
349GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2, 3
350GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0, 3
351GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1, 0
352GENERATE_JSIMD_YCC_RGB_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3, 0
353
354/*****************************************************************************/
355/*
356 * jsimd_extrgb_gray_convert_mips_dspr2
357 * jsimd_extbgr_gray_convert_mips_dspr2
358 * jsimd_extrgbx_gray_convert_mips_dspr2
359 * jsimd_extbgrx_gray_convert_mips_dspr2
360 * jsimd_extxbgr_gray_convert_mips_dspr2
361 * jsimd_extxrgb_gray_convert_mips_dspr2
362 *
363 * Colorspace conversion RGB -> GRAY
364 */
365
366.macro GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 colorid, pixel_size, r_offs, g_offs, b_offs
367
368.macro DO_RGB_TO_GRAY r,    \
369                      g,    \
370                      b,    \
371                      inptr
372    lbu     \r, \r_offs(\inptr)
373    lbu     \g, \g_offs(\inptr)
374    lbu     \b, \b_offs(\inptr)
375    addiu   \inptr, \pixel_size
376.endm
377
378LEAF_MIPS_DSPR2(jsimd_\colorid\()_gray_convert_mips_dspr2)
379/*
380 * a0     - cinfo->image_width
381 * a1     - input_buf
382 * a2     - output_buf
383 * a3     - output_row
384 * 16(sp) - num_rows
385 */
386
387    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
388
389    li      s0, 0x4c8b             // s0 = FIX(0.29900)
390    li      s1, 0x9646             // s1 = FIX(0.58700)
391    li      s2, 0x1d2f             // s2 = FIX(0.11400)
392    li      s7, 0x8000             // s7 = FIX(0.50000)
393    lw      s6, 48(sp)
394    andi    t7, a0, 3
395
3960:
397    addiu   s6, -1                 // s6 = num_rows
398    lw      t0, 0(a1)
399    lw      t1, 0(a2)
400    sll     t3, a3, 2
401    lwx     t1, t3(t1)
402    addiu   a3, 1
403    addu    t9, t1, a0
404    subu    t8, t9, t7
405    beq     t1, t8, 2f
406     nop
407
4081:
409    DO_RGB_TO_GRAY t3, t4, t5, t0
410    DO_RGB_TO_GRAY s3, s4, s5, t0
411
412    mtlo    s7, $ac0
413    maddu   $ac0, s2, t5
414    maddu   $ac0, s1, t4
415    maddu   $ac0, s0, t3
416    mtlo    s7, $ac1
417    maddu   $ac1, s2, s5
418    maddu   $ac1, s1, s4
419    maddu   $ac1, s0, s3
420    extr.w  t6, $ac0, 16
421
422    DO_RGB_TO_GRAY t3, t4, t5, t0
423    DO_RGB_TO_GRAY s3, s4, s5, t0
424
425    mtlo    s7, $ac0
426    maddu   $ac0, s2, t5
427    maddu   $ac0, s1, t4
428    extr.w  t2, $ac1, 16
429    maddu   $ac0, s0, t3
430    mtlo    s7, $ac1
431    maddu   $ac1, s2, s5
432    maddu   $ac1, s1, s4
433    maddu   $ac1, s0, s3
434    extr.w  t5, $ac0, 16
435    sb      t6, 0(t1)
436    sb      t2, 1(t1)
437    extr.w  t3, $ac1, 16
438    addiu   t1, 4
439    sb      t5, -2(t1)
440    sb      t3, -1(t1)
441    bne     t1, t8, 1b
442     nop
443
4442:
445    beqz    t7, 4f
446     nop
447
4483:
449    DO_RGB_TO_GRAY t3, t4, t5, t0
450
451    mtlo    s7, $ac0
452    maddu   $ac0, s2, t5
453    maddu   $ac0, s1, t4
454    maddu   $ac0, s0, t3
455    extr.w  t6, $ac0, 16
456    sb      t6, 0(t1)
457    addiu   t1, 1
458    bne     t1, t9, 3b
459     nop
460
4614:
462    bgtz    s6, 0b
463     addiu  a1, 4
464
465    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
466
467    j ra
468     nop
469END(jsimd_\colorid\()_gray_convert_mips_dspr2)
470
471.purgem DO_RGB_TO_GRAY
472
473.endm
474
475/*------------------------------------------id --  pix R  G  B */
476GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgb,  3, 0, 1, 2
477GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgr,  3, 2, 1, 0
478GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extrgbx, 4, 0, 1, 2
479GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extbgrx, 4, 2, 1, 0
480GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxbgr, 4, 3, 2, 1
481GENERATE_JSIMD_RGB_GRAY_CONVERT_MIPS_DSPR2 extxrgb, 4, 1, 2, 3
482/*****************************************************************************/
483/*
484 * jsimd_h2v2_merged_upsample_mips_dspr2
485 * jsimd_h2v2_extrgb_merged_upsample_mips_dspr2
486 * jsimd_h2v2_extrgbx_merged_upsample_mips_dspr2
487 * jsimd_h2v2_extbgr_merged_upsample_mips_dspr2
488 * jsimd_h2v2_extbgrx_merged_upsample_mips_dspr2
489 * jsimd_h2v2_extxbgr_merged_upsample_mips_dspr2
490 * jsimd_h2v2_extxrgb_merged_upsample_mips_dspr2
491 *
492 * Merged h2v2 upsample routines
493 */
494.macro GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
495                                                pixel_size, \
496                                                r1_offs,    \
497                                                g1_offs,    \
498                                                b1_offs,    \
499                                                a1_offs,    \
500                                                r2_offs,    \
501                                                g2_offs,    \
502                                                b2_offs,    \
503                                                a2_offs
504
505.macro STORE_H2V2_2_PIXELS  scratch0 \
506                            scratch1 \
507                            scratch2 \
508                            scratch3 \
509                            scratch4 \
510                            scratch5 \
511                            outptr
512    sb       \scratch0, \r1_offs(\outptr)
513    sb       \scratch1, \g1_offs(\outptr)
514    sb       \scratch2, \b1_offs(\outptr)
515    sb       \scratch3, \r2_offs(\outptr)
516    sb       \scratch4, \g2_offs(\outptr)
517    sb       \scratch5, \b2_offs(\outptr)
518.if (\pixel_size == 8)
519    li       \scratch0, 0xFF
520    sb       \scratch0, \a1_offs(\outptr)
521    sb       \scratch0, \a2_offs(\outptr)
522.endif
523    addiu    \outptr, \pixel_size
524.endm
525
526.macro STORE_H2V2_1_PIXEL  scratch0 \
527                           scratch1 \
528                           scratch2 \
529                           outptr
530    sb    \scratch0, \r1_offs(\outptr)
531    sb    \scratch1, \g1_offs(\outptr)
532    sb    \scratch2, \b1_offs(\outptr)
533
534.if (\pixel_size == 8)
535    li    t0, 0xFF
536    sb    t0, \a1_offs(\outptr)
537.endif
538.endm
539
540LEAF_MIPS_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
541/*
542 * a0     - cinfo->output_width
543 * a1     - input_buf
544 * a2     - in_row_group_ctr
545 * a3     - output_buf
546 * 16(sp) - cinfo->sample_range_limit
547 */
548
549    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
550
551    lw           t9, 56(sp)        // cinfo->sample_range_limit
552    lw           v0, 0(a1)
553    lw           v1, 4(a1)
554    lw           t0, 8(a1)
555    sll          t1, a2, 3
556    addiu        t2, t1, 4
557    sll          t3, a2, 2
558    lw           t4, 0(a3)         // t4 = output_buf[0]
559    lwx          t1, t1(v0)        // t1 = input_buf[0][in_row_group_ctr*2]
560    lwx          t2, t2(v0)        // t2 = input_buf[0][in_row_group_ctr*2 + 1]
561    lwx          t5, t3(v1)        // t5 = input_buf[1][in_row_group_ctr]
562    lwx          t6, t3(t0)        // t6 = input_buf[2][in_row_group_ctr]
563    lw           t7, 4(a3)         // t7 = output_buf[1]
564    li           s1, 0xe6ea
565    addiu        t8, s1, 0x7fff    // t8 = 0x166e9 [FIX(1.40200)]
566    addiu        s0, t8, 0x5eb9    // s0 = 0x1c5a2 [FIX(1.77200)]
567    addiu        s1, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
568    xori         s2, s1, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
569    srl          t3, a0, 1
570    blez         t3, 2f
571     addu        t0, t5, t3        // t0 = end address
572 1:
573    lbu          t3, 0(t5)
574    lbu          s3, 0(t6)
575    addiu        t5, t5, 1
576    addiu        t3, t3, -128      // (cb - 128)
577    addiu        s3, s3, -128      // (cr - 128)
578    mult         $ac1, s1, t3
579    madd         $ac1, s2, s3
580    sll          s3, s3, 15
581    sll          t3, t3, 15
582    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
583    extr_r.w     s5, $ac1, 16
584    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
585    lbu          v0, 0(t1)
586    addiu        t6, t6, 1
587    addiu        t1, t1, 2
588    addu         t3, v0, s4        // y+cred
589    addu         s3, v0, s5        // y+cgreen
590    addu         v1, v0, s6        // y+cblue
591    addu         t3, t9, t3        // y+cred
592    addu         s3, t9, s3        // y+cgreen
593    addu         v1, t9, v1        // y+cblue
594    lbu          AT, 0(t3)
595    lbu          s7, 0(s3)
596    lbu          ra, 0(v1)
597    lbu          v0, -1(t1)
598    addu         t3, v0, s4        // y+cred
599    addu         s3, v0, s5        // y+cgreen
600    addu         v1, v0, s6        // y+cblue
601    addu         t3, t9, t3        // y+cred
602    addu         s3, t9, s3        // y+cgreen
603    addu         v1, t9, v1        // y+cblue
604    lbu          t3, 0(t3)
605    lbu          s3, 0(s3)
606    lbu          v1, 0(v1)
607    lbu          v0, 0(t2)
608
609    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
610
611    addu         t3, v0, s4        // y+cred
612    addu         s3, v0, s5        // y+cgreen
613    addu         v1, v0, s6        // y+cblue
614    addu         t3, t9, t3        // y+cred
615    addu         s3, t9, s3        // y+cgreen
616    addu         v1, t9, v1        // y+cblue
617    lbu          AT, 0(t3)
618    lbu          s7, 0(s3)
619    lbu          ra, 0(v1)
620    lbu          v0, 1(t2)
621    addiu        t2, t2, 2
622    addu         t3, v0, s4        // y+cred
623    addu         s3, v0, s5        // y+cgreen
624    addu         v1, v0, s6        // y+cblue
625    addu         t3, t9, t3        // y+cred
626    addu         s3, t9, s3        // y+cgreen
627    addu         v1, t9, v1        // y+cblue
628    lbu          t3, 0(t3)
629    lbu          s3, 0(s3)
630    lbu          v1, 0(v1)
631
632    STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
633
634    bne          t0, t5, 1b
635     nop
6362:
637    andi         t0, a0, 1
638    beqz         t0, 4f
639     lbu          t3, 0(t5)
640    lbu          s3, 0(t6)
641    addiu        t3, t3, -128      // (cb - 128)
642    addiu        s3, s3, -128      // (cr - 128)
643    mult         $ac1, s1, t3
644    madd         $ac1, s2, s3
645    sll          s3, s3, 15
646    sll          t3, t3, 15
647    lbu          v0, 0(t1)
648    extr_r.w     s5, $ac1, 16
649    mulq_rs.w    s4, t8, s3        // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
650    mulq_rs.w    s6, s0, t3        // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
651    addu         t3, v0, s4        // y+cred
652    addu         s3, v0, s5        // y+cgreen
653    addu         v1, v0, s6        // y+cblue
654    addu         t3, t9, t3        // y+cred
655    addu         s3, t9, s3        // y+cgreen
656    addu         v1, t9, v1        // y+cblue
657    lbu          t3, 0(t3)
658    lbu          s3, 0(s3)
659    lbu          v1, 0(v1)
660    lbu          v0, 0(t2)
661
662    STORE_H2V2_1_PIXEL t3, s3, v1, t4
663
664    addu         t3, v0, s4        // y+cred
665    addu         s3, v0, s5        // y+cgreen
666    addu         v1, v0, s6        // y+cblue
667    addu         t3, t9, t3        // y+cred
668    addu         s3, t9, s3        // y+cgreen
669    addu         v1, t9, v1        // y+cblue
670    lbu          t3, 0(t3)
671    lbu          s3, 0(s3)
672    lbu          v1, 0(v1)
673
674    STORE_H2V2_1_PIXEL t3, s3, v1, t7
6754:
676    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
677
678    j           ra
679     nop
680
681END(jsimd_h2v2_\colorid\()_merged_upsample_mips_dspr2)
682
683.purgem STORE_H2V2_1_PIXEL
684.purgem STORE_H2V2_2_PIXELS
685.endm
686
687/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
688GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
689GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
690GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
691GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
692GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
693GENERATE_H2V2_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
694/*****************************************************************************/
695/*
696 * jsimd_h2v1_merged_upsample_mips_dspr2
697 * jsimd_h2v1_extrgb_merged_upsample_mips_dspr2
698 * jsimd_h2v1_extrgbx_merged_upsample_mips_dspr2
699 * jsimd_h2v1_extbgr_merged_upsample_mips_dspr2
700 * jsimd_h2v1_extbgrx_merged_upsample_mips_dspr2
701 * jsimd_h2v1_extxbgr_merged_upsample_mips_dspr2
702 * jsimd_h2v1_extxrgb_merged_upsample_mips_dspr2
703 *
704 * Merged h2v1 upsample routines
705 */
706
707.macro GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 colorid,    \
708                                                pixel_size, \
709                                                r1_offs,    \
710                                                g1_offs,    \
711                                                b1_offs,    \
712                                                a1_offs,    \
713                                                r2_offs,    \
714                                                g2_offs,    \
715                                                b2_offs,    \
716                                                a2_offs
717
718.macro STORE_H2V1_2_PIXELS  scratch0 \
719                            scratch1 \
720                            scratch2 \
721                            scratch3 \
722                            scratch4 \
723                            scratch5 \
724                            outptr
725    sb       \scratch0, \r1_offs(\outptr)
726    sb       \scratch1, \g1_offs(\outptr)
727    sb       \scratch2, \b1_offs(\outptr)
728    sb       \scratch3, \r2_offs(\outptr)
729    sb       \scratch4, \g2_offs(\outptr)
730    sb       \scratch5, \b2_offs(\outptr)
731.if (\pixel_size == 8)
732    li       t0, 0xFF
733    sb       t0, \a1_offs(\outptr)
734    sb       t0, \a2_offs(\outptr)
735.endif
736    addiu    \outptr, \pixel_size
737.endm
738
739.macro STORE_H2V1_1_PIXEL  scratch0 \
740                           scratch1 \
741                           scratch2 \
742                           outptr
743    sb    \scratch0, \r1_offs(\outptr)
744    sb    \scratch1, \g1_offs(\outptr)
745    sb    \scratch2, \b1_offs(\outptr)
746.if (\pixel_size == 8)
747    li    t0, 0xFF
748    sb    t0, \a1_offs(\outptr)
749.endif
750.endm
751
752LEAF_MIPS_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
753/*
754 * a0     - cinfo->output_width
755 * a1     - input_buf
756 * a2     - in_row_group_ctr
757 * a3     - output_buf
758 * 16(sp) - range_limit
759 */
760
761    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
762
763    li           t0, 0xe6ea
764    lw           t1, 0(a1)         // t1 = input_buf[0]
765    lw           t2, 4(a1)         // t2 = input_buf[1]
766    lw           t3, 8(a1)         // t3 = input_buf[2]
767    lw           t8, 56(sp)        // t8 = range_limit
768    addiu        s1, t0, 0x7fff    // s1 = 0x166e9 [FIX(1.40200)]
769    addiu        s2, s1, 0x5eb9    // s2 = 0x1c5a2 [FIX(1.77200)]
770    addiu        s0, t0, 0x9916    // s0 = 0x8000
771    addiu        s4, zero, 0xa7e6  // s4 = 0xffffa7e6 [-FIX(0.34414)]
772    xori         s3, s4, 0xeec8    // s3 = 0xffff492e [-FIX(0.71414)]
773    srl          t0, a0, 1
774    sll          t4, a2, 2
775    lwx          s5, t4(t1)        // s5 = inptr0
776    lwx          s6, t4(t2)        // s6 = inptr1
777    lwx          s7, t4(t3)        // s7 = inptr2
778    lw           t7, 0(a3)         // t7 = outptr
779    blez         t0, 2f
780     addu        t9, s6, t0        // t9 = end address
7811:
782    lbu          t2, 0(s6)         // t2 = cb
783    lbu          t0, 0(s7)         // t0 = cr
784    lbu          t1, 0(s5)         // t1 = y
785    addiu        t2, t2, -128      // t2 = cb - 128
786    addiu        t0, t0, -128      // t0 = cr - 128
787    mult         $ac1, s4, t2
788    madd         $ac1, s3, t0
789    sll          t0, t0, 15
790    sll          t2, t2, 15
791    mulq_rs.w    t0, s1, t0        // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
792    extr_r.w     t5, $ac1, 16
793    mulq_rs.w    t6, s2, t2        // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
794    addiu        s7, s7, 1
795    addiu        s6, s6, 1
796    addu         t2, t1, t0        // t2 = y + cred
797    addu         t3, t1, t5        // t3 = y + cgreen
798    addu         t4, t1, t6        // t4 = y + cblue
799    addu         t2, t8, t2
800    addu         t3, t8, t3
801    addu         t4, t8, t4
802    lbu          t1, 1(s5)
803    lbu          v0, 0(t2)
804    lbu          v1, 0(t3)
805    lbu          ra, 0(t4)
806    addu         t2, t1, t0
807    addu         t3, t1, t5
808    addu         t4, t1, t6
809    addu         t2, t8, t2
810    addu         t3, t8, t3
811    addu         t4, t8, t4
812    lbu          t2, 0(t2)
813    lbu          t3, 0(t3)
814    lbu          t4, 0(t4)
815
816    STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
817
818    bne          t9, s6, 1b
819     addiu       s5, s5, 2
8202:
821    andi         t0, a0, 1
822    beqz         t0, 4f
823     nop
8243:
825    lbu          t2, 0(s6)
826    lbu          t0, 0(s7)
827    lbu          t1, 0(s5)
828    addiu        t2, t2, -128      //(cb - 128)
829    addiu        t0, t0, -128      //(cr - 128)
830    mul          t3, s4, t2
831    mul          t4, s3, t0
832    sll          t0, t0, 15
833    sll          t2, t2, 15
834    mulq_rs.w    t0, s1, t0       // (C1*cr + ONE_HALF)>> SCALEBITS
835    mulq_rs.w    t6, s2, t2       // (C2*cb + ONE_HALF)>> SCALEBITS
836    addu         t3, t3, s0
837    addu         t3, t4, t3
838    sra          t5, t3, 16       // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
839    addu         t2, t1, t0       // y + cred
840    addu         t3, t1, t5       // y + cgreen
841    addu         t4, t1, t6       // y + cblue
842    addu         t2, t8, t2
843    addu         t3, t8, t3
844    addu         t4, t8, t4
845    lbu          t2, 0(t2)
846    lbu          t3, 0(t3)
847    lbu          t4, 0(t4)
848
849    STORE_H2V1_1_PIXEL t2, t3, t4, t7
8504:
851    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
852
853    j            ra
854     nop
855
856END(jsimd_h2v1_\colorid\()_merged_upsample_mips_dspr2)
857
858.purgem STORE_H2V1_1_PIXEL
859.purgem STORE_H2V1_2_PIXELS
860.endm
861
862/*-----------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
863GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgb,  6, 0, 1, 2, 6, 3, 4, 5, 6
864GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgr,  6, 2, 1, 0, 3, 5, 4, 3, 6
865GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
866GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
867GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
868GENERATE_H2V1_MERGED_UPSAMPLE_MIPS_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
869/*****************************************************************************/
870/*
871 * jsimd_h2v2_fancy_upsample_mips_dspr2
872 *
873 * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
874 */
875LEAF_MIPS_DSPR2(jsimd_h2v2_fancy_upsample_mips_dspr2)
876/*
877 * a0     - cinfo->max_v_samp_factor
878 * a1     - downsampled_width
879 * a2     - input_data
880 * a3     - output_data_ptr
881 */
882
883    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
884
885    li             s4, 0
886    lw             s2, 0(a3)       // s2 = *output_data_ptr
8870:
888    li             t9, 2
889    lw             s1, -4(a2)      // s1 = inptr1
890
8911:
892    lw             s0, 0(a2)       // s0 = inptr0
893    lwx            s3, s4(s2)
894    addiu          s5, a1, -2      // s5 = downsampled_width - 2
895    srl            t4, s5, 1
896    sll            t4, t4, 1
897    lbu            t0, 0(s0)
898    lbu            t1, 1(s0)
899    lbu            t2, 0(s1)
900    lbu            t3, 1(s1)
901    addiu          s0, 2
902    addiu          s1, 2
903    addu           t8, s0, t4      // t8 = end address
904    andi           s5, s5, 1       // s5 = residual
905    sll            t4, t0, 1
906    sll            t6, t1, 1
907    addu           t0, t0, t4      // t0 = (*inptr0++) * 3
908    addu           t1, t1, t6      // t1 = (*inptr0++) * 3
909    addu           t7, t0, t2      // t7 = thiscolsum
910    addu           t6, t1, t3      // t5 = nextcolsum
911    sll            t0, t7, 2       // t0 = thiscolsum * 4
912    subu           t1, t0, t7      // t1 = thiscolsum * 3
913    shra_r.w       t0, t0, 4
914    addiu          t1, 7
915    addu           t1, t1, t6
916    srl            t1, t1, 4
917    sb             t0, 0(s3)
918    sb             t1, 1(s3)
919    beq            t8, s0, 22f     // skip to final iteration if width == 3
920     addiu          s3, 2
9212:
922    lh             t0, 0(s0)       // t0 = A3|A2
923    lh             t2, 0(s1)       // t2 = B3|B2
924    addiu          s0, 2
925    addiu          s1, 2
926    preceu.ph.qbr  t0, t0          // t0 = 0|A3|0|A2
927    preceu.ph.qbr  t2, t2          // t2 = 0|B3|0|B2
928    shll.ph        t1, t0, 1
929    sll            t3, t6, 1
930    addu.ph        t0, t1, t0      // t0 = A3*3|A2*3
931    addu           t3, t3, t6      // t3 = this * 3
932    addu.ph        t0, t0, t2      // t0 = next2|next1
933    addu           t1, t3, t7
934    andi           t7, t0, 0xFFFF  // t7 = next1
935    sll            t2, t7, 1
936    addu           t2, t7, t2      // t2 = next1*3
937    addu           t4, t2, t6
938    srl            t6, t0, 16      // t6 = next2
939    shra_r.w       t1, t1, 4       // t1 = (this*3 + last + 8) >> 4
940    addu           t0, t3, t7
941    addiu          t0, 7
942    srl            t0, t0, 4       // t0 = (this*3 + next1 + 7) >> 4
943    shra_r.w       t4, t4, 4       // t3 = (next1*3 + this + 8) >> 4
944    addu           t2, t2, t6
945    addiu          t2, 7
946    srl            t2, t2, 4       // t2 = (next1*3 + next2 + 7) >> 4
947    sb             t1, 0(s3)
948    sb             t0, 1(s3)
949    sb             t4, 2(s3)
950    sb             t2, 3(s3)
951    bne            t8, s0, 2b
952     addiu         s3, 4
95322:
954    beqz           s5, 4f
955     addu          t8, s0, s5
9563:
957    lbu            t0, 0(s0)
958    lbu            t2, 0(s1)
959    addiu          s0, 1
960    addiu          s1, 1
961    sll            t3, t6, 1
962    sll            t1, t0, 1
963    addu           t1, t0, t1      // t1 = inptr0 * 3
964    addu           t3, t3, t6      // t3 = thiscolsum * 3
965    addu           t5, t1, t2
966    addu           t1, t3, t7
967    shra_r.w       t1, t1, 4
968    addu           t0, t3, t5
969    addiu          t0, 7
970    srl            t0, t0, 4
971    sb             t1, 0(s3)
972    sb             t0, 1(s3)
973    addiu          s3, 2
974    move           t7, t6
975    bne            t8, s0, 3b
976     move          t6, t5
9774:
978    sll            t0, t6, 2       // t0 = thiscolsum * 4
979    subu           t1, t0, t6      // t1 = thiscolsum * 3
980    addu           t1, t1, t7
981    addiu          s4, 4
982    shra_r.w       t1, t1, 4
983    addiu          t0, 7
984    srl            t0, t0, 4
985    sb             t1, 0(s3)
986    sb             t0, 1(s3)
987    addiu          t9, -1
988    addiu          s3, 2
989    bnez           t9, 1b
990     lw            s1, 4(a2)
991    srl            t0, s4, 2
992    subu           t0, a0, t0
993    bgtz           t0, 0b
994     addiu         a2, 4
995
996    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
997
998    j ra
999     nop
1000END(jsimd_h2v2_fancy_upsample_mips_dspr2)
1001
1002/*****************************************************************************/
1003LEAF_MIPS_DSPR2(jsimd_h2v1_fancy_upsample_mips_dspr2)
1004/*
1005 * a0     - cinfo->max_v_samp_factor
1006 * a1     - downsampled_width
1007 * a2     - input_data
1008 * a3     - output_data_ptr
1009 */
1010
1011    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1012
1013    .set at
1014
1015    beqz           a0, 3f
1016     sll           t0, a0, 2
1017    lw             s1, 0(a3)
1018    li             s3, 0x10001
1019    addu           s0, s1, t0
10200:
1021    addiu          t8, a1, -2
1022    srl            t9, t8, 2
1023    lw             t7, 0(a2)
1024    lw             s2, 0(s1)
1025    lbu            t0, 0(t7)
1026    lbu            t1, 1(t7)   // t1 = inptr[1]
1027    sll            t2, t0, 1
1028    addu           t2, t2, t0  // t2 = invalue*3
1029    addu           t2, t2, t1
1030    shra_r.w       t2, t2, 2
1031    sb             t0, 0(s2)
1032    sb             t2, 1(s2)
1033    beqz           t9, 11f
1034     addiu         s2, 2
10351:
1036    ulw            t0, 0(t7)   // t0 = |P3|P2|P1|P0|
1037    ulw            t1, 1(t7)
1038    ulh            t2, 4(t7)   // t2 = |0|0|P5|P4|
1039    preceu.ph.qbl  t3, t0      // t3 = |0|P3|0|P2|
1040    preceu.ph.qbr  t0, t0      // t0 = |0|P1|0|P0|
1041    preceu.ph.qbr  t2, t2      // t2 = |0|P5|0|P4|
1042    preceu.ph.qbl  t4, t1      // t4 = |0|P4|0|P3|
1043    preceu.ph.qbr  t1, t1      // t1 = |0|P2|0|P1|
1044    shll.ph        t5, t4, 1
1045    shll.ph        t6, t1, 1
1046    addu.ph        t5, t5, t4  // t5 = |P4*3|P3*3|
1047    addu.ph        t6, t6, t1  // t6 = |P2*3|P1*3|
1048    addu.ph        t4, t3, s3
1049    addu.ph        t0, t0, s3
1050    addu.ph        t4, t4, t5
1051    addu.ph        t0, t0, t6
1052    shrl.ph        t4, t4, 2   // t4 = |0|P3|0|P2|
1053    shrl.ph        t0, t0, 2   // t0 = |0|P1|0|P0|
1054    addu.ph        t2, t2, t5
1055    addu.ph        t3, t3, t6
1056    shra_r.ph      t2, t2, 2   // t2 = |0|P5|0|P4|
1057    shra_r.ph      t3, t3, 2   // t3 = |0|P3|0|P2|
1058    shll.ph        t2, t2, 8
1059    shll.ph        t3, t3, 8
1060    or             t2, t4, t2
1061    or             t3, t3, t0
1062    addiu          t9, -1
1063    usw            t3, 0(s2)
1064    usw            t2, 4(s2)
1065    addiu          s2, 8
1066    bgtz           t9, 1b
1067     addiu         t7, 4
106811:
1069    andi           t8, 3
1070    beqz           t8, 22f
1071     addiu         t7, 1
1072
10732:
1074    lbu            t0, 0(t7)
1075    addiu          t7, 1
1076    sll            t1, t0, 1
1077    addu           t2, t0, t1  // t2 = invalue
1078    lbu            t3, -2(t7)
1079    lbu            t4, 0(t7)
1080    addiu          t3, 1
1081    addiu          t4, 2
1082    addu           t3, t3, t2
1083    addu           t4, t4, t2
1084    srl            t3, 2
1085    srl            t4, 2
1086    sb             t3, 0(s2)
1087    sb             t4, 1(s2)
1088    addiu          t8, -1
1089    bgtz           t8, 2b
1090     addiu         s2, 2
1091
109222:
1093    lbu            t0, 0(t7)
1094    lbu            t2, -1(t7)
1095    sll            t1, t0, 1
1096    addu           t1, t1, t0 // t1 = invalue * 3
1097    addu           t1, t1, t2
1098    addiu          t1, 1
1099    srl            t1, t1, 2
1100    sb             t1, 0(s2)
1101    sb             t0, 1(s2)
1102    addiu          s1, 4
1103    bne            s1, s0, 0b
1104     addiu         a2, 4
11053:
1106    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1107
1108    j              ra
1109     nop
1110END(jsimd_h2v1_fancy_upsample_mips_dspr2)
1111
1112/*****************************************************************************/
1113LEAF_MIPS_DSPR2(jsimd_h2v1_downsample_mips_dspr2)
1114/*
1115 * a0     - cinfo->image_width
1116 * a1     - cinfo->max_v_samp_factor
1117 * a2     - compptr->v_samp_factor
1118 * a3     - compptr->width_in_blocks
1119 * 16(sp) - input_data
1120 * 20(sp) - output_data
1121 */
1122    .set at
1123
1124    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
1125
1126    beqz        a2, 7f
1127     lw         s1, 44(sp)  // s1 = output_data
1128    lw          s0, 40(sp)  // s0 = input_data
1129    srl         s2, a0, 2
1130    andi        t9, a0, 2
1131    srl         t7, t9, 1
1132    addu        s2, t7, s2
1133    sll         t0, a3, 3   // t0 = width_in_blocks*DCT
1134    srl         t7, t0, 1
1135    subu        s2, t7, s2
11360:
1137    andi        t6, a0, 1   // t6 = temp_index
1138    addiu       t6, -1
1139    lw          t4, 0(s1)   // t4 = outptr
1140    lw          t5, 0(s0)   // t5 = inptr0
1141    li          s3, 0       // s3 = bias
1142    srl         t7, a0, 1   // t7 = image_width1
1143    srl         s4, t7, 2
1144    andi        t8, t7, 3
11451:
1146    ulhu        t0, 0(t5)
1147    ulhu        t1, 2(t5)
1148    ulhu        t2, 4(t5)
1149    ulhu        t3, 6(t5)
1150    raddu.w.qb  t0, t0
1151    raddu.w.qb  t1, t1
1152    raddu.w.qb  t2, t2
1153    raddu.w.qb  t3, t3
1154    shra.ph     t0, t0, 1
1155    shra_r.ph   t1, t1, 1
1156    shra.ph     t2, t2, 1
1157    shra_r.ph   t3, t3, 1
1158    sb          t0, 0(t4)
1159    sb          t1, 1(t4)
1160    sb          t2, 2(t4)
1161    sb          t3, 3(t4)
1162    addiu       s4, -1
1163    addiu       t4, 4
1164    bgtz        s4, 1b
1165     addiu      t5, 8
1166    beqz        t8, 3f
1167     addu       s4, t4, t8
11682:
1169    ulhu        t0, 0(t5)
1170    raddu.w.qb  t0, t0
1171    addqh.w     t0, t0, s3
1172    xori        s3, s3, 1
1173    sb          t0, 0(t4)
1174    addiu       t4, 1
1175    bne         t4, s4, 2b
1176     addiu      t5, 2
11773:
1178    lbux        t1, t6(t5)
1179    sll         t1, 1
1180    addqh.w     t2, t1, s3  // t2 = pixval1
1181    xori        s3, s3, 1
1182    addqh.w     t3, t1, s3  // t3 = pixval2
1183    blez        s2, 5f
1184     append     t3, t2,  8
1185    addu        t5, t4, s2  // t5 = loop_end2
11864:
1187    ush         t3, 0(t4)
1188    addiu       s2, -1
1189    bgtz        s2, 4b
1190     addiu      t4,  2
11915:
1192    beqz        t9, 6f
1193     nop
1194    sb          t2, 0(t4)
11956:
1196    addiu       s1, 4
1197    addiu       a2, -1
1198    bnez        a2, 0b
1199     addiu      s0, 4
12007:
1201    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
1202
1203    j           ra
1204    nop
1205END(jsimd_h2v1_downsample_mips_dspr2)
1206
1207/*****************************************************************************/
1208LEAF_MIPS_DSPR2(jsimd_h2v2_downsample_mips_dspr2)
1209
1210/*
1211 * a0     - cinfo->image_width
1212 * a1     - cinfo->max_v_samp_factor
1213 * a2     - compptr->v_samp_factor
1214 * a3     - compptr->width_in_blocks
1215 * 16(sp) - input_data
1216 * 20(sp) - output_data
1217 */
1218    .set at
1219    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1220
1221    beqz         a2, 8f
1222     lw          s1, 52(sp)      // s1 = output_data
1223    lw           s0, 48(sp)      // s0 = input_data
1224
1225    andi         t6, a0, 1       // t6 = temp_index
1226    addiu        t6, -1
1227    srl          t7, a0, 1       // t7 = image_width1
1228    srl          s4, t7, 2
1229    andi         t8, t7, 3
1230    andi         t9, a0, 2
1231    srl          s2, a0, 2
1232    srl          t7, t9, 1
1233    addu         s2, t7, s2
1234    sll          t0, a3, 3       // s2 = width_in_blocks*DCT
1235    srl          t7, t0, 1
1236    subu         s2, t7, s2
12370:
1238    lw           t4, 0(s1)       // t4 = outptr
1239    lw           t5, 0(s0)       // t5 = inptr0
1240    lw           s7, 4(s0)       // s7 = inptr1
1241    li           s6, 1           // s6 = bias
12422:
1243    ulw          t0, 0(t5)       // t0 = |P3|P2|P1|P0|
1244    ulw          t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
1245    ulw          t2, 4(t5)
1246    ulw          t3, 4(s7)
1247    precrq.ph.w  t7, t0, t1      // t2 = |P3|P2|Q3|Q2|
1248    ins          t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
1249    raddu.w.qb   t1, t7
1250    raddu.w.qb   t0, t0
1251    shra_r.w     t1, t1, 2
1252    addiu        t0, 1
1253    srl          t0, 2
1254    precrq.ph.w  t7, t2, t3
1255    ins          t2, t3, 16, 16
1256    raddu.w.qb   t7, t7
1257    raddu.w.qb   t2, t2
1258    shra_r.w     t7, t7, 2
1259    addiu        t2, 1
1260    srl          t2, 2
1261    sb           t0, 0(t4)
1262    sb           t1, 1(t4)
1263    sb           t2, 2(t4)
1264    sb           t7, 3(t4)
1265    addiu        t4, 4
1266    addiu        t5, 8
1267    addiu        s4, s4, -1
1268    bgtz         s4, 2b
1269     addiu       s7, 8
1270    beqz         t8, 4f
1271     addu        t8, t4, t8
12723:
1273    ulhu         t0, 0(t5)
1274    ulhu         t1, 0(s7)
1275    ins          t0, t1, 16, 16
1276    raddu.w.qb   t0, t0
1277    addu         t0, t0, s6
1278    srl          t0, 2
1279    xori         s6, s6, 3
1280    sb           t0, 0(t4)
1281    addiu        t5, 2
1282    addiu        t4, 1
1283    bne          t8, t4, 3b
1284     addiu       s7, 2
12854:
1286    lbux         t1, t6(t5)
1287    sll          t1, 1
1288    lbux         t0, t6(s7)
1289    sll          t0, 1
1290    addu         t1, t1, t0
1291    addu         t3, t1, s6
1292    srl          t0, t3, 2       // t2 = pixval1
1293    xori         s6, s6, 3
1294    addu         t2, t1, s6
1295    srl          t1, t2, 2       // t3 = pixval2
1296    blez         s2, 6f
1297     append      t1, t0, 8
12985:
1299    ush          t1, 0(t4)
1300    addiu        s2, -1
1301    bgtz         s2, 5b
1302     addiu       t4, 2
13036:
1304    beqz         t9, 7f
1305     nop
1306    sb           t0, 0(t4)
13077:
1308    addiu        s1, 4
1309    addiu        a2, -1
1310    bnez         a2, 0b
1311     addiu       s0, 8
13128:
1313    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1314
1315    j            ra
1316     nop
1317END(jsimd_h2v2_downsample_mips_dspr2)
1318/*****************************************************************************/
1319LEAF_MIPS_DSPR2(jsimd_h2v2_smooth_downsample_mips_dspr2)
1320/*
1321 * a0     - input_data
1322 * a1     - output_data
1323 * a2     - compptr->v_samp_factor
1324 * a3     - cinfo->max_v_samp_factor
1325 * 16(sp) - cinfo->smoothing_factor
1326 * 20(sp) - compptr->width_in_blocks
1327 * 24(sp) - cinfo->image_width
1328 */
1329
1330    .set at
1331
1332    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1333
1334    lw          s7, 52(sp)      // compptr->width_in_blocks
1335    lw          s0, 56(sp)      // cinfo->image_width
1336    lw          s6, 48(sp)      // cinfo->smoothing_factor
1337    sll         s7, 3           // output_cols = width_in_blocks * DCTSIZE
1338    sll         v0, s7, 1
1339    subu        v0, v0, s0
1340    blez        v0, 2f
1341    move        v1, zero
1342    addiu       t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
13430:
1344    addiu       t1, a0, -4
1345    sll         t2, v1, 2
1346    lwx         t1, t2(t1)
1347    move        t3, v0
1348    addu        t1, t1, s0
1349    lbu         t2, -1(t1)
13501:
1351    addiu       t3, t3, -1
1352    sb          t2, 0(t1)
1353    bgtz        t3, 1b
1354    addiu       t1, t1, 1
1355    addiu       v1, v1, 1
1356    bne         v1, t0, 0b
1357    nop
13582:
1359    li          v0, 80
1360    mul         v0, s6, v0
1361    li          v1, 16384
1362    move        t4, zero
1363    move        t5, zero
1364    subu        t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
1365    sll         t7, s6, 4       // t7 = tmp_smoot_f * 16
13663:
1367/* Special case for first column: pretend column -1 is same as column 0 */
1368    sll         v0, t4, 2
1369    lwx         t8, v0(a1)      //  outptr = output_data[outrow]
1370    sll         v1, t5, 2
1371    addiu       t9, v1, 4
1372    addiu       s0, v1, -4
1373    addiu       s1, v1, 8
1374    lwx         s2, v1(a0)      // inptr0 = input_data[inrow]
1375    lwx         t9, t9(a0)      // inptr1 = input_data[inrow+1]
1376    lwx         s0, s0(a0)      // above_ptr = input_data[inrow-1]
1377    lwx         s1, s1(a0)      // below_ptr = input_data[inrow+2]
1378    lh          v0, 0(s2)
1379    lh          v1, 0(t9)
1380    lh          t0, 0(s0)
1381    lh          t1, 0(s1)
1382    ins         v0, v1, 16, 16
1383    ins         t0, t1, 16, 16
1384    raddu.w.qb  t2, v0
1385    raddu.w.qb  s3, t0
1386    lbu         v0, 0(s2)
1387    lbu         v1, 2(s2)
1388    lbu         t0, 0(t9)
1389    lbu         t1, 2(t9)
1390    addu        v0, v0, v1
1391    mult        $ac1,t2, t6
1392    addu        t0, t0, t1
1393    lbu         t2, 2(s0)
1394    addu        t0, t0, v0
1395    lbu         t3, 2(s1)
1396    addu        s3, t0, s3
1397    lbu         v0, 0(s0)
1398    lbu         t0, 0(s1)
1399    sll         s3, s3, 1
1400    addu        v0, v0, t2
1401    addu        t0, t0, t3
1402    addu        t0, t0, v0
1403    addu        s3, t0, s3
1404    madd        $ac1,s3, t7
1405    extr_r.w    v0, $ac1, 16
1406    addiu       t8, t8, 1
1407    addiu       s2, s2, 2
1408    addiu       t9, t9, 2
1409    addiu       s0, s0, 2
1410    addiu       s1, s1, 2
1411    sb          v0, -1(t8)
1412    addiu       s4, s7, -2
1413    and         s4, s4, 3
1414    addu        s5, s4, t8      //end adress
14154:
1416    lh          v0, 0(s2)
1417    lh          v1, 0(t9)
1418    lh          t0, 0(s0)
1419    lh          t1, 0(s1)
1420    ins         v0, v1, 16, 16
1421    ins         t0, t1, 16, 16
1422    raddu.w.qb  t2, v0
1423    raddu.w.qb  s3, t0
1424    lbu         v0, -1(s2)
1425    lbu         v1, 2(s2)
1426    lbu         t0, -1(t9)
1427    lbu         t1, 2(t9)
1428    addu        v0, v0, v1
1429    mult        $ac1, t2, t6
1430    addu        t0, t0, t1
1431    lbu         t2, 2(s0)
1432    addu        t0, t0, v0
1433    lbu         t3, 2(s1)
1434    addu        s3, t0, s3
1435    lbu         v0, -1(s0)
1436    lbu         t0, -1(s1)
1437    sll         s3, s3, 1
1438    addu        v0, v0, t2
1439    addu        t0, t0, t3
1440    addu        t0, t0, v0
1441    addu        s3, t0, s3
1442    madd        $ac1, s3, t7
1443    extr_r.w    t2, $ac1, 16
1444    addiu       t8, t8, 1
1445    addiu       s2, s2, 2
1446    addiu       t9, t9, 2
1447    addiu       s0, s0, 2
1448    sb          t2, -1(t8)
1449    bne         s5, t8, 4b
1450    addiu       s1, s1, 2
1451    addiu       s5, s7, -2
1452    subu        s5, s5, s4
1453    addu        s5, s5, t8      //end adress
14545:
1455    lh          v0, 0(s2)
1456    lh          v1, 0(t9)
1457    lh          t0, 0(s0)
1458    lh          t1, 0(s1)
1459    ins         v0, v1, 16, 16
1460    ins         t0, t1, 16, 16
1461    raddu.w.qb  t2, v0
1462    raddu.w.qb  s3, t0
1463    lbu         v0, -1(s2)
1464    lbu         v1, 2(s2)
1465    lbu         t0, -1(t9)
1466    lbu         t1, 2(t9)
1467    addu        v0, v0, v1
1468    mult        $ac1, t2, t6
1469    addu        t0, t0, t1
1470    lbu         t2, 2(s0)
1471    addu        t0, t0, v0
1472    lbu         t3, 2(s1)
1473    addu        s3, t0, s3
1474    lbu         v0, -1(s0)
1475    lbu         t0, -1(s1)
1476    sll         s3, s3, 1
1477    addu        v0, v0, t2
1478    addu        t0, t0, t3
1479    lh          v1, 2(t9)
1480    addu        t0, t0, v0
1481    lh          v0, 2(s2)
1482    addu        s3, t0, s3
1483    lh          t0, 2(s0)
1484    lh          t1, 2(s1)
1485    madd        $ac1, s3, t7
1486    extr_r.w    t2, $ac1, 16
1487    ins         t0, t1, 16, 16
1488    ins         v0, v1, 16, 16
1489    raddu.w.qb  s3, t0
1490    lbu         v1, 4(s2)
1491    lbu         t0, 1(t9)
1492    lbu         t1, 4(t9)
1493    sb          t2, 0(t8)
1494    raddu.w.qb  t3, v0
1495    lbu         v0, 1(s2)
1496    addu        t0, t0, t1
1497    mult        $ac1, t3, t6
1498    addu        v0, v0, v1
1499    lbu         t2, 4(s0)
1500    addu        t0, t0, v0
1501    lbu         v0, 1(s0)
1502    addu        s3, t0, s3
1503    lbu         t0, 1(s1)
1504    lbu         t3, 4(s1)
1505    addu        v0, v0, t2
1506    sll         s3, s3, 1
1507    addu        t0, t0, t3
1508    lh          v1, 4(t9)
1509    addu        t0, t0, v0
1510    lh          v0, 4(s2)
1511    addu        s3, t0, s3
1512    lh          t0, 4(s0)
1513    lh          t1, 4(s1)
1514    madd        $ac1, s3, t7
1515    extr_r.w    t2, $ac1, 16
1516    ins         t0, t1, 16, 16
1517    ins         v0, v1, 16, 16
1518    raddu.w.qb  s3, t0
1519    lbu         v1, 6(s2)
1520    lbu         t0, 3(t9)
1521    lbu         t1, 6(t9)
1522    sb          t2, 1(t8)
1523    raddu.w.qb  t3, v0
1524    lbu         v0, 3(s2)
1525    addu        t0, t0,t1
1526    mult        $ac1, t3, t6
1527    addu        v0, v0, v1
1528    lbu         t2, 6(s0)
1529    addu        t0, t0, v0
1530    lbu         v0, 3(s0)
1531    addu        s3, t0, s3
1532    lbu         t0, 3(s1)
1533    lbu         t3, 6(s1)
1534    addu        v0, v0, t2
1535    sll         s3, s3, 1
1536    addu        t0, t0, t3
1537    lh          v1, 6(t9)
1538    addu        t0, t0, v0
1539    lh          v0, 6(s2)
1540    addu        s3, t0, s3
1541    lh          t0, 6(s0)
1542    lh          t1, 6(s1)
1543    madd        $ac1, s3, t7
1544    extr_r.w    t3, $ac1, 16
1545    ins         t0, t1, 16, 16
1546    ins         v0, v1, 16, 16
1547    raddu.w.qb  s3, t0
1548    lbu         v1, 8(s2)
1549    lbu         t0, 5(t9)
1550    lbu         t1, 8(t9)
1551    sb          t3, 2(t8)
1552    raddu.w.qb  t2, v0
1553    lbu         v0, 5(s2)
1554    addu        t0, t0, t1
1555    mult        $ac1, t2, t6
1556    addu        v0, v0, v1
1557    lbu         t2, 8(s0)
1558    addu        t0, t0, v0
1559    lbu         v0, 5(s0)
1560    addu        s3, t0, s3
1561    lbu         t0, 5(s1)
1562    lbu         t3, 8(s1)
1563    addu        v0, v0, t2
1564    sll         s3, s3, 1
1565    addu        t0, t0, t3
1566    addiu       t8, t8, 4
1567    addu        t0, t0, v0
1568    addiu       s2, s2, 8
1569    addu        s3, t0, s3
1570    addiu       t9, t9, 8
1571    madd        $ac1, s3, t7
1572    extr_r.w    t1, $ac1, 16
1573    addiu       s0, s0, 8
1574    addiu       s1, s1, 8
1575    bne         s5, t8, 5b
1576    sb          t1, -1(t8)
1577/* Special case for last column */
1578    lh          v0, 0(s2)
1579    lh          v1, 0(t9)
1580    lh          t0, 0(s0)
1581    lh          t1, 0(s1)
1582    ins         v0, v1, 16, 16
1583    ins         t0, t1, 16, 16
1584    raddu.w.qb  t2, v0
1585    raddu.w.qb  s3, t0
1586    lbu         v0, -1(s2)
1587    lbu         v1, 1(s2)
1588    lbu         t0, -1(t9)
1589    lbu         t1, 1(t9)
1590    addu        v0, v0, v1
1591    mult        $ac1, t2, t6
1592    addu        t0, t0, t1
1593    lbu         t2, 1(s0)
1594    addu        t0, t0, v0
1595    lbu         t3, 1(s1)
1596    addu        s3, t0, s3
1597    lbu         v0, -1(s0)
1598    lbu         t0, -1(s1)
1599    sll         s3, s3, 1
1600    addu        v0, v0, t2
1601    addu        t0, t0, t3
1602    addu        t0, t0, v0
1603    addu        s3, t0, s3
1604    madd        $ac1, s3, t7
1605    extr_r.w    t0, $ac1, 16
1606    addiu       t5, t5, 2
1607    sb          t0, 0(t8)
1608    addiu       t4, t4, 1
1609    bne         t4, a2, 3b
1610    addiu       t5, t5, 2
1611
1612    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1613
1614    j           ra
1615     nop
1616
1617END(jsimd_h2v2_smooth_downsample_mips_dspr2)
1618
1619/*****************************************************************************/
1620LEAF_MIPS_DSPR2(jsimd_int_upsample_mips_dspr2)
1621/*
1622 * a0     - upsample->h_expand[compptr->component_index]
1623 * a1     - upsample->v_expand[compptr->component_index]
1624 * a2     - input_data
1625 * a3     - output_data_ptr
1626 * 16(sp) - cinfo->output_width
1627 * 20(sp) - cinfo->max_v_samp_factor
1628 */
1629    .set at
1630
1631    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1632
1633    lw      s0, 0(a3)    // s0 = output_data
1634    lw      s1, 32(sp)   // s1 = cinfo->output_width
1635    lw      s2, 36(sp)   // s2 = cinfo->max_v_samp_factor
1636    li      t6, 0        // t6 = inrow
1637    beqz    s2, 10f
1638     li     s3, 0        // s3 = outrow
16390:
1640    addu    t0, a2, t6
1641    addu    t7, s0, s3
1642    lw      t3, 0(t0)    // t3 = inptr
1643    lw      t8, 0(t7)    // t8 = outptr
1644    beqz    s1, 4f
1645     addu   t5, t8, s1   // t5 = outend
16461:
1647    lb      t2, 0(t3)    // t2 = invalue = *inptr++
1648    addiu   t3, 1
1649    beqz    a0, 3f
1650     move   t0, a0       // t0 = h_expand
16512:
1652    sb      t2, 0(t8)
1653    addiu   t0, -1
1654    bgtz    t0, 2b
1655     addiu  t8, 1
16563:
1657    bgt     t5, t8, 1b
1658     nop
16594:
1660    addiu   t9, a1, -1   // t9 = v_expand - 1
1661    blez    t9, 9f
1662     nop
16635:
1664    lw      t3, 0(s0)
1665    lw      t4, 4(s0)
1666    subu    t0, s1, 0xF
1667    blez    t0, 7f
1668     addu   t5, t3, s1   // t5 = end address
1669    andi    t7, s1, 0xF  // t7 = residual
1670    subu    t8, t5, t7
16716:
1672    ulw     t0, 0(t3)
1673    ulw     t1, 4(t3)
1674    ulw     t2, 8(t3)
1675    usw     t0, 0(t4)
1676    ulw     t0, 12(t3)
1677    usw     t1, 4(t4)
1678    usw     t2, 8(t4)
1679    usw     t0, 12(t4)
1680    addiu   t3, 16
1681    bne     t3, t8, 6b
1682     addiu  t4, 16
1683    beqz    t7, 8f
1684     nop
16857:
1686    lbu     t0, 0(t3)
1687    sb      t0, 0(t4)
1688    addiu   t3, 1
1689    bne     t3, t5, 7b
1690     addiu  t4, 1
16918:
1692    addiu   t9, -1
1693    bgtz    t9, 5b
1694     addiu  s0, 8
16959:
1696    addu    s3, s3, a1
1697    bne     s3, s2, 0b
1698     addiu  t6, 1
169910:
1700    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1701
1702    j       ra
1703     nop
1704END(jsimd_int_upsample_mips_dspr2)
1705
1706/*****************************************************************************/
1707LEAF_MIPS_DSPR2(jsimd_h2v1_upsample_mips_dspr2)
1708/*
1709 * a0     - cinfo->max_v_samp_factor
1710 * a1     - cinfo->output_width
1711 * a2     - input_data
1712 * a3     - output_data_ptr
1713 */
1714    lw      t7, 0(a3)       // t7 = output_data
1715    andi    t8, a1, 0xf     // t8 = residual
1716    sll     t0, a0, 2
1717    blez    a0, 4f
1718     addu   t9, t7, t0      // t9 = output_data end address
17190:
1720    lw      t5, 0(t7)       // t5 = outptr
1721    lw      t6, 0(a2)       // t6 = inptr
1722    addu    t3, t5, a1      // t3 = outptr + output_width (end address)
1723    subu    t3, t8          // t3 = end address - residual
1724    beq     t5, t3, 2f
1725     move   t4, t8
17261:
1727    ulw     t0, 0(t6)       // t0 = |P3|P2|P1|P0|
1728    ulw     t2, 4(t6)       // t2 = |P7|P6|P5|P4|
1729    srl     t1, t0, 16      // t1 = |X|X|P3|P2|
1730    ins     t0, t0, 16, 16  // t0 = |P1|P0|P1|P0|
1731    ins     t1, t1, 16, 16  // t1 = |P3|P2|P3|P2|
1732    ins     t0, t0, 8, 16   // t0 = |P1|P1|P0|P0|
1733    ins     t1, t1, 8, 16   // t1 = |P3|P3|P2|P2|
1734    usw     t0, 0(t5)
1735    usw     t1, 4(t5)
1736    srl     t0, t2, 16      // t0 = |X|X|P7|P6|
1737    ins     t2, t2, 16, 16  // t2 = |P5|P4|P5|P4|
1738    ins     t0, t0, 16, 16  // t0 = |P7|P6|P7|P6|
1739    ins     t2, t2, 8, 16   // t2 = |P5|P5|P4|P4|
1740    ins     t0, t0, 8, 16   // t0 = |P7|P7|P6|P6|
1741    usw     t2, 8(t5)
1742    usw     t0, 12(t5)
1743    addiu   t5, 16
1744    bne     t5, t3, 1b
1745     addiu  t6, 8
1746    beqz    t8, 3f
1747     move   t4, t8
17482:
1749    lbu     t1, 0(t6)
1750    sb      t1, 0(t5)
1751    sb      t1, 1(t5)
1752    addiu   t4, -2
1753    addiu   t6, 1
1754    bgtz    t4, 2b
1755     addiu  t5, 2
17563:
1757    addiu   t7, 4
1758    bne     t9, t7, 0b
1759     addiu  a2, 4
17604:
1761    j       ra
1762     nop
1763END(jsimd_h2v1_upsample_mips_dspr2)
1764
1765/*****************************************************************************/
1766LEAF_MIPS_DSPR2(jsimd_h2v2_upsample_mips_dspr2)
1767/*
1768 * a0     - cinfo->max_v_samp_factor
1769 * a1     - cinfo->output_width
1770 * a2     - input_data
1771 * a3     - output_data_ptr
1772 */
1773    lw      t7, 0(a3)
1774    blez    a0, 7f
1775     andi   t9, a1, 0xf     // t9 = residual
17760:
1777    lw      t6, 0(a2)       // t6 = inptr
1778    lw      t5, 0(t7)       // t5 = outptr
1779    addu    t8, t5, a1      // t8 = outptr end address
1780    subu    t8, t9          // t8 = end address - residual
1781    beq     t5, t8, 2f
1782     move   t4, t9
17831:
1784    ulw     t0, 0(t6)
1785    srl     t1, t0, 16
1786    ins     t0, t0, 16, 16
1787    ins     t0, t0, 8, 16
1788    ins     t1, t1, 16, 16
1789    ins     t1, t1, 8, 16
1790    ulw     t2, 4(t6)
1791    usw     t0, 0(t5)
1792    usw     t1, 4(t5)
1793    srl     t3, t2, 16
1794    ins     t2, t2, 16, 16
1795    ins     t2, t2, 8, 16
1796    ins     t3, t3, 16, 16
1797    ins     t3, t3, 8, 16
1798    usw     t2, 8(t5)
1799    usw     t3, 12(t5)
1800    addiu   t5, 16
1801    bne     t5, t8, 1b
1802     addiu  t6, 8
1803    beqz    t9, 3f
1804     move   t4, t9
18052:
1806    lbu     t0, 0(t6)
1807    sb      t0, 0(t5)
1808    sb      t0, 1(t5)
1809    addiu   t4, -2
1810    addiu   t6, 1
1811    bgtz    t4, 2b
1812     addiu  t5, 2
18133:
1814    lw      t6, 0(t7)       // t6 = outptr[0]
1815    lw      t5, 4(t7)       // t5 = outptr[1]
1816    addu    t4, t6, a1      // t4 = new end address
1817    beq     a1, t9, 5f
1818     subu   t8, t4, t9
18194:
1820    ulw     t0, 0(t6)
1821    ulw     t1, 4(t6)
1822    ulw     t2, 8(t6)
1823    usw     t0, 0(t5)
1824    ulw     t0, 12(t6)
1825    usw     t1, 4(t5)
1826    usw     t2, 8(t5)
1827    usw     t0, 12(t5)
1828    addiu   t6, 16
1829    bne     t6, t8, 4b
1830     addiu  t5, 16
1831    beqz    t9, 6f
1832     nop
18335:
1834    lbu     t0, 0(t6)
1835    sb      t0, 0(t5)
1836    addiu   t6, 1
1837    bne     t6, t4, 5b
1838     addiu  t5, 1
18396:
1840    addiu   t7, 8
1841    addiu   a0, -2
1842    bgtz    a0, 0b
1843     addiu  a2, 4
18447:
1845    j       ra
1846     nop
1847END(jsimd_h2v2_upsample_mips_dspr2)
1848
1849/*****************************************************************************/
1850LEAF_MIPS_DSPR2(jsimd_idct_islow_mips_dspr2)
1851/*
1852 * a0     - coef_block
1853 * a1     - compptr->dcttable
1854 * a2     - output
1855 * a3     - range_limit
1856 */
1857
1858    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1859
1860    addiu     sp, sp, -256
1861    move      v0, sp
1862    addiu     v1, zero, 8      // v1 = DCTSIZE = 8
18631:
1864    lh        s4, 32(a0)       // s4 = inptr[16]
1865    lh        s5, 64(a0)       // s5 = inptr[32]
1866    lh        s6, 96(a0)       // s6 = inptr[48]
1867    lh        t1, 112(a0)      // t1 = inptr[56]
1868    lh        t7, 16(a0)       // t7 = inptr[8]
1869    lh        t5, 80(a0)       // t5 = inptr[40]
1870    lh        t3, 48(a0)       // t3 = inptr[24]
1871    or        s4, s4, t1
1872    or        s4, s4, t3
1873    or        s4, s4, t5
1874    or        s4, s4, t7
1875    or        s4, s4, s5
1876    or        s4, s4, s6
1877    bnez      s4, 2f
1878     addiu    v1, v1, -1
1879    lh        s5, 0(a1)        // quantptr[DCTSIZE*0]
1880    lh        s6, 0(a0)        // inptr[DCTSIZE*0]
1881    mul       s5, s5, s6       // DEQUANTIZE(inptr[0], quantptr[0])
1882    sll       s5, s5, 2
1883    sw        s5, 0(v0)
1884    sw        s5, 32(v0)
1885    sw        s5, 64(v0)
1886    sw        s5, 96(v0)
1887    sw        s5, 128(v0)
1888    sw        s5, 160(v0)
1889    sw        s5, 192(v0)
1890    b         3f
1891     sw       s5, 224(v0)
18922:
1893    lh        t0, 112(a1)
1894    lh        t2, 48(a1)
1895    lh        t4, 80(a1)
1896    lh        t6, 16(a1)
1897    mul       t0, t0, t1       // DEQUANTIZE(inptr[DCTSIZE*7],quant[DCTSIZE*7])
1898    mul       t1, t2, t3       // DEQUANTIZE(inptr[DCTSIZE*3],quant[DCTSIZE*3])
1899    mul       t2, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*5],quant[DCTSIZE*5])
1900    mul       t3, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*1],quant[DCTSIZE*1])
1901    lh        t4, 32(a1)
1902    lh        t5, 32(a0)
1903    lh        t6, 96(a1)
1904    lh        t7, 96(a0)
1905    addu      s0, t0, t1       // z3 = tmp0 + tmp2
1906    addu      s1, t1, t2       // z2 = tmp1 + tmp2
1907    addu      s2, t2, t3       // z4 = tmp1 + tmp3
1908    addu      s3, s0, s2       // z3 + z4
1909    addiu     t9, zero, 9633   // FIX_1_175875602
1910    mul       s3, s3, t9       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1911    addu      t8, t0, t3       // z1 = tmp0 + tmp3
1912    addiu     t9, zero, 2446   // FIX_0_298631336
1913    mul       t0, t0, t9       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1914    addiu     t9, zero, 16819  // FIX_2_053119869
1915    mul       t2, t2, t9       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1916    addiu     t9, zero, 25172  // FIX_3_072711026
1917    mul       t1, t1, t9       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1918    addiu     t9, zero, 12299  // FIX_1_501321110
1919    mul       t3, t3, t9       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1920    addiu     t9, zero, 16069  // FIX_1_961570560
1921    mul       s0, s0, t9       // -z3 = MULTIPLY(z3, FIX_1_961570560)
1922    addiu     t9, zero, 3196   // FIX_0_390180644
1923    mul       s2, s2, t9       // -z4 = MULTIPLY(z4, FIX_0_390180644)
1924    addiu     t9, zero, 7373   // FIX_0_899976223
1925    mul       t8, t8, t9       // -z1 = MULTIPLY(z1, FIX_0_899976223)
1926    addiu     t9, zero, 20995  // FIX_2_562915447
1927    mul       s1, s1, t9       // -z2 = MULTIPLY(z2, FIX_2_562915447)
1928    subu      s0, s3, s0       // z3 += z5
1929    addu      t0, t0, s0       // tmp0 += z3
1930    addu      t1, t1, s0       // tmp2 += z3
1931    subu      s2, s3, s2       // z4 += z5
1932    addu      t2, t2, s2       // tmp1 += z4
1933    addu      t3, t3, s2       // tmp3 += z4
1934    subu      t0, t0, t8       // tmp0 += z1
1935    subu      t1, t1, s1       // tmp2 += z2
1936    subu      t2, t2, s1       // tmp1 += z2
1937    subu      t3, t3, t8       // tmp3 += z1
1938    mul       s0, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*2],quant[DCTSIZE*2])
1939    addiu     t9, zero, 6270   // FIX_0_765366865
1940    mul       s1, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*6],quant[DCTSIZE*6])
1941    lh        t4, 0(a1)
1942    lh        t5, 0(a0)
1943    lh        t6, 64(a1)
1944    lh        t7, 64(a0)
1945    mul       s2, t9, s0       // MULTIPLY(z2, FIX_0_765366865)
1946    mul       t5, t4, t5       // DEQUANTIZE(inptr[DCTSIZE*0],quant[DCTSIZE*0])
1947    mul       t6, t6, t7       // DEQUANTIZE(inptr[DCTSIZE*4],quant[DCTSIZE*4])
1948    addiu     t9, zero, 4433   // FIX_0_541196100
1949    addu      s3, s0, s1       // z2 + z3
1950    mul       s3, s3, t9       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1951    addiu     t9, zero, 15137  // FIX_1_847759065
1952    mul       t8, s1, t9       // MULTIPLY(z3, FIX_1_847759065)
1953    addu      t4, t5, t6
1954    subu      t5, t5, t6
1955    sll       t4, t4, 13       // tmp0 = (z2 + z3) << CONST_BITS
1956    sll       t5, t5, 13       // tmp1 = (z2 - z3) << CONST_BITS
1957    addu      t7, s3, s2       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1958    subu      t6, s3, t8       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
1959    addu      s0, t4, t7
1960    subu      s1, t4, t7
1961    addu      s2, t5, t6
1962    subu      s3, t5, t6
1963    addu      t4, s0, t3
1964    subu      s0, s0, t3
1965    addu      t3, s2, t1
1966    subu      s2, s2, t1
1967    addu      t1, s3, t2
1968    subu      s3, s3, t2
1969    addu      t2, s1, t0
1970    subu      s1, s1, t0
1971    shra_r.w  t4, t4, 11
1972    shra_r.w  t3, t3, 11
1973    shra_r.w  t1, t1, 11
1974    shra_r.w  t2, t2, 11
1975    shra_r.w  s1, s1, 11
1976    shra_r.w  s3, s3, 11
1977    shra_r.w  s2, s2, 11
1978    shra_r.w  s0, s0, 11
1979    sw        t4, 0(v0)
1980    sw        t3, 32(v0)
1981    sw        t1, 64(v0)
1982    sw        t2, 96(v0)
1983    sw        s1, 128(v0)
1984    sw        s3, 160(v0)
1985    sw        s2, 192(v0)
1986    sw        s0, 224(v0)
19873:
1988    addiu     a1, a1, 2
1989    addiu     a0, a0, 2
1990    bgtz      v1, 1b
1991     addiu    v0, v0, 4
1992    move      v0, sp
1993    addiu     v1, zero, 8
19944:
1995    lw        t0, 8(v0)        // z2 = (INT32) wsptr[2]
1996    lw        t1, 24(v0)       // z3 = (INT32) wsptr[6]
1997    lw        t2, 0(v0)        // (INT32) wsptr[0]
1998    lw        t3, 16(v0)       // (INT32) wsptr[4]
1999    lw        s4, 4(v0)        // (INT32) wsptr[1]
2000    lw        s5, 12(v0)       // (INT32) wsptr[3]
2001    lw        s6, 20(v0)       // (INT32) wsptr[5]
2002    lw        s7, 28(v0)       // (INT32) wsptr[7]
2003    or        s4, s4, t0
2004    or        s4, s4, t1
2005    or        s4, s4, t3
2006    or        s4, s4, s7
2007    or        s4, s4, s5
2008    or        s4, s4, s6
2009    bnez      s4, 5f
2010     addiu    v1, v1, -1
2011    shra_r.w  s5, t2, 5
2012    andi      s5, s5, 0x3ff
2013    lbux      s5, s5(a3)
2014    lw        s1, 0(a2)
2015    replv.qb  s5, s5
2016    usw       s5, 0(s1)
2017    usw       s5, 4(s1)
2018    b         6f
2019     nop
20205:
2021    addu      t4, t0, t1       // z2 + z3
2022    addiu     t8, zero, 4433   // FIX_0_541196100
2023    mul       t5, t4, t8       // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2024    addiu     t8, zero, 15137  // FIX_1_847759065
2025    mul       t1, t1, t8       // MULTIPLY(z3, FIX_1_847759065)
2026    addiu     t8, zero, 6270   // FIX_0_765366865
2027    mul       t0, t0, t8       // MULTIPLY(z2, FIX_0_765366865)
2028    addu      t4, t2, t3       // (INT32) wsptr[0] + (INT32) wsptr[4]
2029    subu      t2, t2, t3       // (INT32) wsptr[0] - (INT32) wsptr[4]
2030    sll       t4, t4, 13       // tmp0 = ((wsptr[0] + wsptr[4]) << CONST_BITS
2031    sll       t2, t2, 13       // tmp1 = ((wsptr[0] - wsptr[4]) << CONST_BITS
2032    subu      t1, t5, t1       // tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065)
2033    subu      t3, t2, t1       // tmp12 = tmp1 - tmp2
2034    addu      t2, t2, t1       // tmp11 = tmp1 + tmp2
2035    addu      t5, t5, t0       // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2036    subu      t1, t4, t5       // tmp13 = tmp0 - tmp3
2037    addu      t0, t4, t5       // tmp10 = tmp0 + tmp3
2038    lw        t4, 28(v0)       // tmp0 = (INT32) wsptr[7]
2039    lw        t6, 12(v0)       // tmp2 = (INT32) wsptr[3]
2040    lw        t5, 20(v0)       // tmp1 = (INT32) wsptr[5]
2041    lw        t7, 4(v0)        // tmp3 = (INT32) wsptr[1]
2042    addu      s0, t4, t6       // z3 = tmp0 + tmp2
2043    addiu     t8, zero, 9633   // FIX_1_175875602
2044    addu      s1, t5, t7       // z4 = tmp1 + tmp3
2045    addu      s2, s0, s1       // z3 + z4
2046    mul       s2, s2, t8       // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2047    addu      s3, t4, t7       // z1 = tmp0 + tmp3
2048    addu      t9, t5, t6       // z2 = tmp1 + tmp2
2049    addiu     t8, zero, 16069  // FIX_1_961570560
2050    mul       s0, s0, t8       // -z3 = MULTIPLY(z3, FIX_1_961570560)
2051    addiu     t8, zero, 3196   // FIX_0_390180644
2052    mul       s1, s1, t8       // -z4 = MULTIPLY(z4, FIX_0_390180644)
2053    addiu     t8, zero, 2446   // FIX_0_298631336
2054    mul       t4, t4, t8       // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2055    addiu     t8, zero, 7373   // FIX_0_899976223
2056    mul       s3, s3, t8       // -z1 = MULTIPLY(z1, FIX_0_899976223)
2057    addiu     t8, zero, 16819  // FIX_2_053119869
2058    mul       t5, t5, t8       // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2059    addiu     t8, zero, 20995  // FIX_2_562915447
2060    mul       t9, t9, t8       // -z2 = MULTIPLY(z2, FIX_2_562915447)
2061    addiu     t8, zero, 25172  // FIX_3_072711026
2062    mul       t6, t6, t8       // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2063    addiu     t8, zero, 12299  // FIX_1_501321110
2064    mul       t7, t7, t8       // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2065    subu      s0, s2, s0       // z3 += z5
2066    subu      s1, s2, s1       // z4 += z5
2067    addu      t4, t4, s0
2068    subu      t4, t4, s3       // tmp0
2069    addu      t5, t5, s1
2070    subu      t5, t5, t9       // tmp1
2071    addu      t6, t6, s0
2072    subu      t6, t6, t9       // tmp2
2073    addu      t7, t7, s1
2074    subu      t7, t7, s3       // tmp3
2075    addu      s0, t0, t7
2076    subu      t0, t0, t7
2077    addu      t7, t2, t6
2078    subu      t2, t2, t6
2079    addu      t6, t3, t5
2080    subu      t3, t3, t5
2081    addu      t5, t1, t4
2082    subu      t1, t1, t4
2083    shra_r.w  s0, s0, 18
2084    shra_r.w  t7, t7, 18
2085    shra_r.w  t6, t6, 18
2086    shra_r.w  t5, t5, 18
2087    shra_r.w  t1, t1, 18
2088    shra_r.w  t3, t3, 18
2089    shra_r.w  t2, t2, 18
2090    shra_r.w  t0, t0, 18
2091    andi      s0, s0, 0x3ff
2092    andi      t7, t7, 0x3ff
2093    andi      t6, t6, 0x3ff
2094    andi      t5, t5, 0x3ff
2095    andi      t1, t1, 0x3ff
2096    andi      t3, t3, 0x3ff
2097    andi      t2, t2, 0x3ff
2098    andi      t0, t0, 0x3ff
2099    lw        s1, 0(a2)
2100    lbux      s0, s0(a3)
2101    lbux      t7, t7(a3)
2102    lbux      t6, t6(a3)
2103    lbux      t5, t5(a3)
2104    lbux      t1, t1(a3)
2105    lbux      t3, t3(a3)
2106    lbux      t2, t2(a3)
2107    lbux      t0, t0(a3)
2108    sb        s0, 0(s1)
2109    sb        t7, 1(s1)
2110    sb        t6, 2(s1)
2111    sb        t5, 3(s1)
2112    sb        t1, 4(s1)
2113    sb        t3, 5(s1)
2114    sb        t2, 6(s1)
2115    sb        t0, 7(s1)
21166:
2117    addiu     v0, v0, 32
2118    bgtz      v1, 4b
2119     addiu    a2, a2, 4
2120    addiu     sp, sp, 256
2121
2122    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2123
2124    j         ra
2125     nop
2126
2127END(jsimd_idct_islow_mips_dspr2)
2128
2129/*****************************************************************************/
2130LEAF_MIPS_DSPR2(jsimd_idct_ifast_cols_mips_dspr2)
2131/*
2132 * a0     - inptr
2133 * a1     - quantptr
2134 * a2     - wsptr
2135 * a3     - mips_idct_ifast_coefs
2136 */
2137
2138    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2139
2140    addiu          t9, a0, 16            // end address
2141    or             AT, a3, zero
2142
21430:
2144    lw             s0, 0(a1)             // quantptr[DCTSIZE*0]
2145    lw             t0, 0(a0)             // inptr[DCTSIZE*0]
2146    lw             t1, 16(a0)            // inptr[DCTSIZE*1]
2147    muleq_s.w.phl  v0, t0, s0            // tmp0 ...
2148    lw             t2, 32(a0)            // inptr[DCTSIZE*2]
2149    lw             t3, 48(a0)            // inptr[DCTSIZE*3]
2150    lw             t4, 64(a0)            // inptr[DCTSIZE*4]
2151    lw             t5, 80(a0)            // inptr[DCTSIZE*5]
2152    muleq_s.w.phr  t0, t0, s0            // ... tmp0 ...
2153    lw             t6, 96(a0)            // inptr[DCTSIZE*6]
2154    lw             t7, 112(a0)           // inptr[DCTSIZE*7]
2155    or             s4, t1, t2
2156    or             s5, t3, t4
2157    bnez           s4, 1f
2158     ins           t0, v0, 16, 16        // ... tmp0
2159    bnez           s5, 1f
2160     or            s6, t5, t6
2161    or             s6, s6, t7
2162    bnez           s6, 1f
2163     sw            t0, 0(a2)             // wsptr[DCTSIZE*0]
2164    sw             t0, 16(a2)            // wsptr[DCTSIZE*1]
2165    sw             t0, 32(a2)            // wsptr[DCTSIZE*2]
2166    sw             t0, 48(a2)            // wsptr[DCTSIZE*3]
2167    sw             t0, 64(a2)            // wsptr[DCTSIZE*4]
2168    sw             t0, 80(a2)            // wsptr[DCTSIZE*5]
2169    sw             t0, 96(a2)            // wsptr[DCTSIZE*6]
2170    sw             t0, 112(a2)           // wsptr[DCTSIZE*7]
2171    addiu          a0, a0, 4
2172    b              2f
2173     addiu         a1, a1, 4
2174
21751:
2176    lw             s1, 32(a1)            // quantptr[DCTSIZE*2]
2177    lw             s2, 64(a1)            // quantptr[DCTSIZE*4]
2178    muleq_s.w.phl  v0, t2, s1            // tmp1 ...
2179    muleq_s.w.phr  t2, t2, s1            // ... tmp1 ...
2180    lw             s0, 16(a1)            // quantptr[DCTSIZE*1]
2181    lw             s1, 48(a1)            // quantptr[DCTSIZE*3]
2182    lw             s3, 96(a1)            // quantptr[DCTSIZE*6]
2183    muleq_s.w.phl  v1, t4, s2            // tmp2 ...
2184    muleq_s.w.phr  t4, t4, s2            // ... tmp2 ...
2185    lw             s2, 80(a1)            // quantptr[DCTSIZE*5]
2186    lw             t8, 4(AT)             // FIX(1.414213562)
2187    ins            t2, v0, 16, 16        // ... tmp1
2188    muleq_s.w.phl  v0, t6, s3            // tmp3 ...
2189    muleq_s.w.phr  t6, t6, s3            // ... tmp3 ...
2190    ins            t4, v1, 16, 16        // ... tmp2
2191    addq.ph        s4, t0, t4            // tmp10
2192    subq.ph        s5, t0, t4            // tmp11
2193    ins            t6, v0, 16, 16        // ... tmp3
2194    subq.ph        s6, t2, t6            // tmp12 ...
2195    addq.ph        s7, t2, t6            // tmp13
2196    mulq_s.ph      s6, s6, t8            // ... tmp12 ...
2197    addq.ph        t0, s4, s7            // tmp0
2198    subq.ph        t6, s4, s7            // tmp3
2199    muleq_s.w.phl  v0, t1, s0            // tmp4 ...
2200    muleq_s.w.phr  t1, t1, s0            // ... tmp4 ...
2201    shll_s.ph      s6, s6, 1             // x2
2202    lw             s3, 112(a1)           // quantptr[DCTSIZE*7]
2203    subq.ph        s6, s6, s7            // ... tmp12
2204    muleq_s.w.phl  v1, t7, s3            // tmp7 ...
2205    muleq_s.w.phr  t7, t7, s3            // ... tmp7 ...
2206    ins            t1, v0, 16, 16        // ... tmp4
2207    addq.ph        t2, s5, s6            // tmp1
2208    subq.ph        t4, s5, s6            // tmp2
2209    muleq_s.w.phl  v0, t5, s2            // tmp6 ...
2210    muleq_s.w.phr  t5, t5, s2            // ... tmp6 ...
2211    ins            t7, v1, 16, 16        // ... tmp7
2212    addq.ph        s5, t1, t7            // z11
2213    subq.ph        s6, t1, t7            // z12
2214    muleq_s.w.phl  v1, t3, s1            // tmp5 ...
2215    muleq_s.w.phr  t3, t3, s1            // ... tmp5 ...
2216    ins            t5, v0, 16, 16        // ... tmp6
2217    ins            t3, v1, 16, 16        // ... tmp5
2218    addq.ph        s7, t5, t3            // z13
2219    subq.ph        v0, t5, t3            // z10
2220    addq.ph        t7, s5, s7            // tmp7
2221    subq.ph        s5, s5, s7            // tmp11 ...
2222    addq.ph        v1, v0, s6            // z5 ...
2223    mulq_s.ph      s5, s5, t8            // ... tmp11
2224    lw             t8, 8(AT)             // FIX(1.847759065)
2225    lw             s4, 0(AT)             // FIX(1.082392200)
2226    addq.ph        s0, t0, t7
2227    subq.ph        s1, t0, t7
2228    mulq_s.ph      v1, v1, t8            // ... z5
2229    shll_s.ph      s5, s5, 1             // x2
2230    lw             t8, 12(AT)            // FIX(-2.613125930)
2231    sw             s0, 0(a2)             // wsptr[DCTSIZE*0]
2232    shll_s.ph      v0, v0, 1             // x4
2233    mulq_s.ph      v0, v0, t8            // tmp12 ...
2234    mulq_s.ph      s4, s6, s4            // tmp10 ...
2235    shll_s.ph      v1, v1, 1             // x2
2236    addiu          a0, a0, 4
2237    addiu          a1, a1, 4
2238    sw             s1, 112(a2)           // wsptr[DCTSIZE*7]
2239    shll_s.ph      s6, v0, 1             // x4
2240    shll_s.ph      s4, s4, 1             // x2
2241    addq.ph        s6, s6, v1            // ... tmp12
2242    subq.ph        t5, s6, t7            // tmp6
2243    subq.ph        s4, s4, v1            // ... tmp10
2244    subq.ph        t3, s5, t5            // tmp5
2245    addq.ph        s2, t2, t5
2246    addq.ph        t1, s4, t3            // tmp4
2247    subq.ph        s3, t2, t5
2248    sw             s2, 16(a2)            // wsptr[DCTSIZE*1]
2249    sw             s3, 96(a2)            // wsptr[DCTSIZE*6]
2250    addq.ph        v0, t4, t3
2251    subq.ph        v1, t4, t3
2252    sw             v0, 32(a2)            // wsptr[DCTSIZE*2]
2253    sw             v1, 80(a2)            // wsptr[DCTSIZE*5]
2254    addq.ph        v0, t6, t1
2255    subq.ph        v1, t6, t1
2256    sw             v0, 64(a2)            // wsptr[DCTSIZE*4]
2257    sw             v1, 48(a2)            // wsptr[DCTSIZE*3]
2258
22592:
2260    bne            a0, t9, 0b
2261     addiu         a2, a2, 4
2262
2263    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2264
2265    j              ra
2266     nop
2267
2268END(jsimd_idct_ifast_cols_mips_dspr2)
2269
2270/*****************************************************************************/
2271LEAF_MIPS_DSPR2(jsimd_idct_ifast_rows_mips_dspr2)
2272/*
2273 * a0     - wsptr
2274 * a1     - output_buf
2275 * a2     - output_col
2276 * a3     - mips_idct_ifast_coefs
2277 */
2278
2279    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2280
2281    addiu         t9, a0, 128        // end address
2282    lui           s8, 0x8080
2283    ori           s8, s8, 0x8080
2284
22850:
2286    lw            AT, 36(sp)         // restore $a3 (mips_idct_ifast_coefs)
2287    lw            t0, 0(a0)          // wsptr[DCTSIZE*0+0/1]  b a
2288    lw            s0, 16(a0)         // wsptr[DCTSIZE*1+0/1]  B A
2289    lw            t2, 4(a0)          // wsptr[DCTSIZE*0+2/3]  d c
2290    lw            s2, 20(a0)         // wsptr[DCTSIZE*1+2/3]  D C
2291    lw            t4, 8(a0)          // wsptr[DCTSIZE*0+4/5]  f e
2292    lw            s4, 24(a0)         // wsptr[DCTSIZE*1+4/5]  F E
2293    lw            t6, 12(a0)         // wsptr[DCTSIZE*0+6/7]  h g
2294    lw            s6, 28(a0)         // wsptr[DCTSIZE*1+6/7]  H G
2295    precrq.ph.w   t1, s0, t0         // B b
2296    ins           t0, s0, 16, 16     // A a
2297    bnez          t1, 1f
2298     or           s0, t2, s2
2299    bnez          s0, 1f
2300     or           s0, t4, s4
2301    bnez          s0, 1f
2302     or           s0, t6, s6
2303    bnez          s0, 1f
2304     shll_s.ph    s0, t0, 2          // A a
2305    lw            a3, 0(a1)
2306    lw            AT, 4(a1)
2307    precrq.ph.w   t0, s0, s0         // A A
2308    ins           s0, s0, 16, 16     // a a
2309    addu          a3, a3, a2
2310    addu          AT, AT, a2
2311    precrq.qb.ph  t0, t0, t0         // A A A A
2312    precrq.qb.ph  s0, s0, s0         // a a a a
2313    addu.qb       s0, s0, s8
2314    addu.qb       t0, t0, s8
2315    sw            s0, 0(a3)
2316    sw            s0, 4(a3)
2317    sw            t0, 0(AT)
2318    sw            t0, 4(AT)
2319    addiu         a0, a0, 32
2320    bne           a0, t9, 0b
2321     addiu        a1, a1, 8
2322    b             2f
2323     nop
2324
23251:
2326    precrq.ph.w   t3, s2, t2
2327    ins           t2, s2, 16, 16
2328    precrq.ph.w   t5, s4, t4
2329    ins           t4, s4, 16, 16
2330    precrq.ph.w   t7, s6, t6
2331    ins           t6, s6, 16, 16
2332    lw            t8, 4(AT)          // FIX(1.414213562)
2333    addq.ph       s4, t0, t4         // tmp10
2334    subq.ph       s5, t0, t4         // tmp11
2335    subq.ph       s6, t2, t6         // tmp12 ...
2336    addq.ph       s7, t2, t6         // tmp13
2337    mulq_s.ph     s6, s6, t8         // ... tmp12 ...
2338    addq.ph       t0, s4, s7         // tmp0
2339    subq.ph       t6, s4, s7         // tmp3
2340    shll_s.ph     s6, s6, 1          // x2
2341    subq.ph       s6, s6, s7         // ... tmp12
2342    addq.ph       t2, s5, s6         // tmp1
2343    subq.ph       t4, s5, s6         // tmp2
2344    addq.ph       s5, t1, t7         // z11
2345    subq.ph       s6, t1, t7         // z12
2346    addq.ph       s7, t5, t3         // z13
2347    subq.ph       v0, t5, t3         // z10
2348    addq.ph       t7, s5, s7         // tmp7
2349    subq.ph       s5, s5, s7         // tmp11 ...
2350    addq.ph       v1, v0, s6         // z5 ...
2351    mulq_s.ph     s5, s5, t8         // ... tmp11
2352    lw            t8, 8(AT)          // FIX(1.847759065)
2353    lw            s4, 0(AT)          // FIX(1.082392200)
2354    addq.ph       s0, t0, t7         // tmp0 + tmp7
2355    subq.ph       s7, t0, t7         // tmp0 - tmp7
2356    mulq_s.ph     v1, v1, t8         // ... z5
2357    lw            a3, 0(a1)
2358    lw            t8, 12(AT)         // FIX(-2.613125930)
2359    shll_s.ph     s5, s5, 1          // x2
2360    addu          a3, a3, a2
2361    shll_s.ph     v0, v0, 1          // x4
2362    mulq_s.ph     v0, v0, t8         // tmp12 ...
2363    mulq_s.ph     s4, s6, s4         // tmp10 ...
2364    shll_s.ph     v1, v1, 1          // x2
2365    addiu         a0, a0, 32
2366    addiu         a1, a1, 8
2367    shll_s.ph     s6, v0, 1          // x4
2368    shll_s.ph     s4, s4, 1          // x2
2369    addq.ph       s6, s6, v1         // ... tmp12
2370    shll_s.ph     s0, s0, 2
2371    subq.ph       t5, s6, t7         // tmp6
2372    subq.ph       s4, s4, v1         // ... tmp10
2373    subq.ph       t3, s5, t5         // tmp5
2374    shll_s.ph     s7, s7, 2
2375    addq.ph       t1, s4, t3         // tmp4
2376    addq.ph       s1, t2, t5         // tmp1 + tmp6
2377    subq.ph       s6, t2, t5         // tmp1 - tmp6
2378    addq.ph       s2, t4, t3         // tmp2 + tmp5
2379    subq.ph       s5, t4, t3         // tmp2 - tmp5
2380    addq.ph       s4, t6, t1         // tmp3 + tmp4
2381    subq.ph       s3, t6, t1         // tmp3 - tmp4
2382    shll_s.ph     s1, s1, 2
2383    shll_s.ph     s2, s2, 2
2384    shll_s.ph     s3, s3, 2
2385    shll_s.ph     s4, s4, 2
2386    shll_s.ph     s5, s5, 2
2387    shll_s.ph     s6, s6, 2
2388    precrq.ph.w   t0, s1, s0         // B A
2389    ins           s0, s1, 16, 16     // b a
2390    precrq.ph.w   t2, s3, s2         // D C
2391    ins           s2, s3, 16, 16     // d c
2392    precrq.ph.w   t4, s5, s4         // F E
2393    ins           s4, s5, 16, 16     // f e
2394    precrq.ph.w   t6, s7, s6         // H G
2395    ins           s6, s7, 16, 16     // h g
2396    precrq.qb.ph  t0, t2, t0         // D C B A
2397    precrq.qb.ph  s0, s2, s0         // d c b a
2398    precrq.qb.ph  t4, t6, t4         // H G F E
2399    precrq.qb.ph  s4, s6, s4         // h g f e
2400    addu.qb       s0, s0, s8
2401    addu.qb       s4, s4, s8
2402    sw            s0, 0(a3)          // outptr[0/1/2/3]       d c b a
2403    sw            s4, 4(a3)          // outptr[4/5/6/7]       h g f e
2404    lw            a3, -4(a1)
2405    addu.qb       t0, t0, s8
2406    addu          a3, a3, a2
2407    addu.qb       t4, t4, s8
2408    sw            t0, 0(a3)          // outptr[0/1/2/3]       D C B A
2409    bne           a0, t9, 0b
2410     sw           t4, 4(a3)          // outptr[4/5/6/7]       H G F E
2411
24122:
2413
2414    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2415
2416    j             ra
2417     nop
2418
2419END(jsimd_idct_ifast_rows_mips_dspr2)
2420
2421/*****************************************************************************/
2422LEAF_MIPS_DSPR2(jsimd_fdct_islow_mips_dspr2)
2423/*
2424 * a0     - data
2425 */
2426
2427    SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2428
2429    lui       t0, 6437
2430    ori       t0, 2260
2431    lui       t1, 9633
2432    ori       t1, 11363
2433    lui       t2, 0xd39e
2434    ori       t2, 0xe6dc
2435    lui       t3, 0xf72d
2436    ori       t3, 9633
2437    lui       t4, 2261
2438    ori       t4, 9633
2439    lui       t5, 0xd39e
2440    ori       t5, 6437
2441    lui       t6, 9633
2442    ori       t6, 0xd39d
2443    lui       t7, 0xe6dc
2444    ori       t7, 2260
2445    lui       t8, 4433
2446    ori       t8, 10703
2447    lui       t9, 0xd630
2448    ori       t9, 4433
2449    li        s8, 8
2450    move      a1, a0
24511:
2452    lw        s0, 0(a1)     // tmp0 = 1|0
2453    lw        s1, 4(a1)     // tmp1 = 3|2
2454    lw        s2, 8(a1)     // tmp2 = 5|4
2455    lw        s3, 12(a1)    // tmp3 = 7|6
2456    packrl.ph s1, s1, s1    // tmp1 = 2|3
2457    packrl.ph s3, s3, s3    // tmp3 = 6|7
2458    subq.ph   s7, s1, s2    // tmp7 = 2-5|3-4 = t5|t4
2459    subq.ph   s5, s0, s3    // tmp5 = 1-6|0-7 = t6|t7
2460    mult      $0, $0        // ac0  = 0
2461    dpa.w.ph  $ac0, s7, t0  // ac0 += t5*  6437 + t4*  2260
2462    dpa.w.ph  $ac0, s5, t1  // ac0 += t6*  9633 + t7* 11363
2463    mult      $ac1, $0, $0  // ac1  = 0
2464    dpa.w.ph  $ac1, s7, t2  // ac1 += t5*-11362 + t4* -6436
2465    dpa.w.ph  $ac1, s5, t3  // ac1 += t6* -2259 + t7*  9633
2466    mult      $ac2, $0, $0  // ac2  = 0
2467    dpa.w.ph  $ac2, s7, t4  // ac2 += t5*  2261 + t4*  9633
2468    dpa.w.ph  $ac2, s5, t5  // ac2 += t6*-11362 + t7*  6437
2469    mult      $ac3, $0, $0  // ac3  = 0
2470    dpa.w.ph  $ac3, s7, t6  // ac3 += t5*  9633 + t4*-11363
2471    dpa.w.ph  $ac3, s5, t7  // ac3 += t6* -6436 + t7*  2260
2472    addq.ph   s6, s1, s2    // tmp6 = 2+5|3+4 = t2|t3
2473    addq.ph   s4, s0, s3    // tmp4 = 1+6|0+7 = t1|t0
2474    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
2475    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
2476    extr_r.w  s2, $ac2, 11  // tmp2 = (ac2 + 1024) >> 11
2477    extr_r.w  s3, $ac3, 11  // tmp3 = (ac3 + 1024) >> 11
2478    addq.ph   s5, s4, s6    // tmp5 = t1+t2|t0+t3 = t11|t10
2479    subq.ph   s7, s4, s6    // tmp7 = t1-t2|t0-t3 = t12|t13
2480    sh        s0, 2(a1)
2481    sh        s1, 6(a1)
2482    sh        s2, 10(a1)
2483    sh        s3, 14(a1)
2484    mult      $0, $0        // ac0  = 0
2485    dpa.w.ph  $ac0, s7, t8  // ac0 += t12*  4433 + t13* 10703
2486    mult      $ac1, $0, $0  // ac1  = 0
2487    dpa.w.ph  $ac1, s7, t9  // ac1 += t12*-10704 + t13*  4433
2488    sra       s4, s5, 16    // tmp4 = t11
2489    addiu     a1, a1, 16
2490    addiu     s8, s8, -1
2491    extr_r.w  s0, $ac0, 11  // tmp0 = (ac0 + 1024) >> 11
2492    extr_r.w  s1, $ac1, 11  // tmp1 = (ac1 + 1024) >> 11
2493    addu      s2, s5, s4    // tmp2 = t10 + t11
2494    subu      s3, s5, s4    // tmp3 = t10 - t11
2495    sll       s2, s2, 2     // tmp2 = (t10 + t11) << 2
2496    sll       s3, s3, 2     // tmp3 = (t10 - t11) << 2
2497    sh        s2, -16(a1)
2498    sh        s3, -8(a1)
2499    sh        s0, -12(a1)
2500    bgtz      s8, 1b
2501     sh       s1, -4(a1)
2502    li        t0, 2260
2503    li        t1, 11363
2504    li        t2, 9633
2505    li        t3, 6436
2506    li        t4, 6437
2507    li        t5, 2261
2508    li        t6, 11362
2509    li        t7, 2259
2510    li        t8, 4433
2511    li        t9, 10703
2512    li        a1, 10704
2513    li        s8, 8
2514
25152:
2516    lh        a2, 0(a0)     // 0
2517    lh        a3, 16(a0)    // 8
2518    lh        v0, 32(a0)    // 16
2519    lh        v1, 48(a0)    // 24
2520    lh        s4, 64(a0)    // 32
2521    lh        s5, 80(a0)    // 40
2522    lh        s6, 96(a0)    // 48
2523    lh        s7, 112(a0)   // 56
2524    addu      s2, v0, s5    // tmp2 = 16 + 40
2525    subu      s5, v0, s5    // tmp5 = 16 - 40
2526    addu      s3, v1, s4    // tmp3 = 24 + 32
2527    subu      s4, v1, s4    // tmp4 = 24 - 32
2528    addu      s0, a2, s7    // tmp0 =  0 + 56
2529    subu      s7, a2, s7    // tmp7 =  0 - 56
2530    addu      s1, a3, s6    // tmp1 =  8 + 48
2531    subu      s6, a3, s6    // tmp6 =  8 - 48
2532    addu      a2, s0, s3    // tmp10 = tmp0 + tmp3
2533    subu      v1, s0, s3    // tmp13 = tmp0 - tmp3
2534    addu      a3, s1, s2    // tmp11 = tmp1 + tmp2
2535    subu      v0, s1, s2    // tmp12 = tmp1 - tmp2
2536    mult      s7, t1        // ac0  = tmp7 * c1
2537    madd      s4, t0        // ac0 += tmp4 * c0
2538    madd      s5, t4        // ac0 += tmp5 * c4
2539    madd      s6, t2        // ac0 += tmp6 * c2
2540    mult      $ac1, s7, t2  // ac1  = tmp7 * c2
2541    msub      $ac1, s4, t3  // ac1 -= tmp4 * c3
2542    msub      $ac1, s5, t6  // ac1 -= tmp5 * c6
2543    msub      $ac1, s6, t7  // ac1 -= tmp6 * c7
2544    mult      $ac2, s7, t4  // ac2  = tmp7 * c4
2545    madd      $ac2, s4, t2  // ac2 += tmp4 * c2
2546    madd      $ac2, s5, t5  // ac2 += tmp5 * c5
2547    msub      $ac2, s6, t6  // ac2 -= tmp6 * c6
2548    mult      $ac3, s7, t0  // ac3  = tmp7 * c0
2549    msub      $ac3, s4, t1  // ac3 -= tmp4 * c1
2550    madd      $ac3, s5, t2  // ac3 += tmp5 * c2
2551    msub      $ac3, s6, t3  // ac3 -= tmp6 * c3
2552    extr_r.w  s0, $ac0, 15  // tmp0 = (ac0 + 16384) >> 15
2553    extr_r.w  s1, $ac1, 15  // tmp1 = (ac1 + 16384) >> 15
2554    extr_r.w  s2, $ac2, 15  // tmp2 = (ac2 + 16384) >> 15
2555    extr_r.w  s3, $ac3, 15  // tmp3 = (ac3 + 16384) >> 15
2556    addiu     s8, s8, -1
2557    addu      s4, a2, a3    // tmp4 = tmp10 + tmp11
2558    subu      s5, a2, a3    // tmp5 = tmp10 - tmp11
2559    sh        s0, 16(a0)
2560    sh        s1, 48(a0)
2561    sh        s2, 80(a0)
2562    sh        s3, 112(a0)
2563    mult      v0, t8        // ac0  = tmp12 * c8
2564    madd      v1, t9        // ac0 += tmp13 * c9
2565    mult      $ac1, v1, t8  // ac1  = tmp13 * c8
2566    msub      $ac1, v0, a1  // ac1 -= tmp12 * c10
2567    addiu     a0, a0, 2
2568    extr_r.w  s6, $ac0, 15  // tmp6 = (ac0 + 16384) >> 15
2569    extr_r.w  s7, $ac1, 15  // tmp7 = (ac1 + 16384) >> 15
2570    shra_r.w  s4, s4, 2     // tmp4 = (tmp4 + 2) >> 2
2571    shra_r.w  s5, s5, 2     // tmp5 = (tmp5 + 2) >> 2
2572    sh        s4, -2(a0)
2573    sh        s5, 62(a0)
2574    sh        s6, 30(a0)
2575    bgtz      s8, 2b
2576     sh       s7, 94(a0)
2577
2578    RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2579
2580    jr       ra
2581     nop
2582
2583END(jsimd_fdct_islow_mips_dspr2)
2584
2585/*****************************************************************************/
2586LEAF_MIPS_DSPR2(jsimd_fdct_ifast_mips_dspr2)
2587/*
2588 * a0     - data
2589 */
2590    .set at
2591    SAVE_REGS_ON_STACK 8, s0, s1
2592    li           a1, 0x014e014e  // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2593    li           a2, 0x008b008b  // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2594    li           a3, 0x00620062  // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2595    li           s1, 0x00b500b5  // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2596
2597    move         v0, a0
2598    addiu        v1, v0, 128     // end address
2599
26000:
2601    lw           t0, 0(v0)       // tmp0 = 1|0
2602    lw           t1, 4(v0)       // tmp1 = 3|2
2603    lw           t2, 8(v0)       // tmp2 = 5|4
2604    lw           t3, 12(v0)      // tmp3 = 7|6
2605    packrl.ph    t1, t1, t1      // tmp1 = 2|3
2606    packrl.ph    t3, t3, t3      // tmp3 = 6|7
2607    subq.ph      t7, t1, t2      // tmp7 = 2-5|3-4 = t5|t4
2608    subq.ph      t5, t0, t3      // tmp5 = 1-6|0-7 = t6|t7
2609    addq.ph      t6, t1, t2      // tmp6 = 2+5|3+4 = t2|t3
2610    addq.ph      t4, t0, t3      // tmp4 = 1+6|0+7 = t1|t0
2611    addq.ph      t8, t4, t6      // tmp5 = t1+t2|t0+t3 = t11|t10
2612    subq.ph      t9, t4, t6      // tmp7 = t1-t2|t0-t3 = t12|t13
2613    sra          t4, t8, 16      // tmp4 = t11
2614    mult         $0, $0          // ac0  = 0
2615    dpa.w.ph     $ac0, t9, s1
2616    mult         $ac1, $0, $0    // ac1  = 0
2617    dpa.w.ph     $ac1, t7, a3    // ac1 += t4*98 + t5*98
2618    dpsx.w.ph    $ac1, t5, a3    // ac1 += t6*98 + t7*98
2619    mult         $ac2, $0, $0    // ac2  = 0
2620    dpa.w.ph     $ac2, t7, a2    // ac2 += t4*139 + t5*139
2621    mult         $ac3, $0, $0    // ac3  = 0
2622    dpa.w.ph     $ac3, t5, a1    // ac3 += t6*334 + t7*334
2623    precrq.ph.w  t0, t5, t7      // t0 = t5|t6
2624    addq.ph      t2, t8, t4      // tmp2 = t10 + t11
2625    subq.ph      t3, t8, t4      // tmp3 = t10 - t11
2626    extr.w       t4, $ac0, 8
2627    mult         $0, $0          // ac0  = 0
2628    dpa.w.ph     $ac0, t0, s1    // ac0 += t5*181 + t6*181
2629    extr.w       t0, $ac1, 8     // t0 = z5
2630    extr.w       t1, $ac2, 8     // t1 = MULTIPLY(tmp10, 139)
2631    extr.w       t7, $ac3, 8     // t2 = MULTIPLY(tmp12, 334)
2632    extr.w       t8, $ac0, 8     // t8 = z3 = MULTIPLY(tmp11, 181)
2633    add          t6, t1, t0      // t6 = z2
2634    add          t7, t7, t0      // t7 = z4
2635    subq.ph      t0, t5, t8      // t0 = z13 = tmp7 - z3
2636    addq.ph      t8, t5, t8      // t9 = z11 = tmp7 + z3
2637    addq.ph      t1, t0, t6      // t1 = z13 + z2
2638    subq.ph      t6, t0, t6      // t6 = z13 - z2
2639    addq.ph      t0, t8, t7      // t0 = z11 + z4
2640    subq.ph      t7, t8, t7      // t7 = z11 - z4
2641    addq.ph      t5, t4, t9
2642    subq.ph      t4, t9, t4
2643    sh           t2, 0(v0)
2644    sh           t5, 4(v0)
2645    sh           t3, 8(v0)
2646    sh           t4, 12(v0)
2647    sh           t1, 10(v0)
2648    sh           t6, 6(v0)
2649    sh           t0, 2(v0)
2650    sh           t7, 14(v0)
2651    addiu        v0, 16
2652    bne          v1, v0, 0b
2653     nop
2654    move         v0, a0
2655    addiu        v1, v0, 16
2656
26571:
2658    lh           t0, 0(v0)       // 0
2659    lh           t1, 16(v0)      // 8
2660    lh           t2, 32(v0)      // 16
2661    lh           t3, 48(v0)      // 24
2662    lh           t4, 64(v0)      // 32
2663    lh           t5, 80(v0)      // 40
2664    lh           t6, 96(v0)      // 48
2665    lh           t7, 112(v0)     // 56
2666    add          t8, t0, t7      // t8 = tmp0
2667    sub          t7, t0, t7      // t7 = tmp7
2668    add          t0, t1, t6      // t0 = tmp1
2669    sub          t1, t1, t6      // t1 = tmp6
2670    add          t6, t2, t5      // t6 = tmp2
2671    sub          t5, t2, t5      // t5 = tmp5
2672    add          t2, t3, t4      // t2 = tmp3
2673    sub          t3, t3, t4      // t3 = tmp4
2674    add          t4, t8, t2      // t4 = tmp10 = tmp0 + tmp3
2675    sub          t8, t8, t2      // t8 = tmp13 = tmp0 - tmp3
2676    sub          s0, t0, t6      // s0 = tmp12 = tmp1 - tmp2
2677    ins          t8, s0, 16, 16  // t8 = tmp12|tmp13
2678    add          t2, t0, t6      // t2 = tmp11 = tmp1 + tmp2
2679    mult         $0, $0          // ac0  = 0
2680    dpa.w.ph     $ac0, t8, s1    // ac0 += t12*181 + t13*181
2681    add          s0, t4, t2      // t8 = tmp10+tmp11
2682    sub          t4, t4, t2      // t4 = tmp10-tmp11
2683    sh           s0, 0(v0)
2684    sh           t4, 64(v0)
2685    extr.w       t2, $ac0, 8     // z1 = MULTIPLY(tmp12+tmp13,FIX_0_707106781)
2686    addq.ph      t4, t8, t2      // t9 = tmp13 + z1
2687    subq.ph      t8, t8, t2      // t2 = tmp13 - z1
2688    sh           t4, 32(v0)
2689    sh           t8, 96(v0)
2690    add          t3, t3, t5      // t3 = tmp10 = tmp4 + tmp5
2691    add          t0, t5, t1      // t0 = tmp11 = tmp5 + tmp6
2692    add          t1, t1, t7      // t1 = tmp12 = tmp6 + tmp7
2693    andi         t4, a1, 0xffff
2694    mul          s0, t1, t4
2695    sra          s0, s0, 8       // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2696    ins          t1, t3, 16, 16  // t1 = tmp10|tmp12
2697    mult         $0, $0          // ac0  = 0
2698    mulsa.w.ph   $ac0, t1, a3    // ac0 += t10*98 - t12*98
2699    extr.w       t8, $ac0, 8     // z5 = MULTIPLY(tmp10-tmp12,FIX_0_382683433)
2700    add          t2, t7, t8      // t2 = tmp7 + z5
2701    sub          t7, t7, t8      // t7 = tmp7 - z5
2702    andi         t4, a2, 0xffff
2703    mul          t8, t3, t4
2704    sra          t8, t8, 8       // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2705    andi         t4, s1, 0xffff
2706    mul          t6, t0, t4
2707    sra          t6, t6, 8       // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2708    add          t0, t6, t8      // t0 = z3 + z2
2709    sub          t1, t6, t8      // t1 = z3 - z2
2710    add          t3, t6, s0      // t3 = z3 + z4
2711    sub          t4, t6, s0      // t4 = z3 - z4
2712    sub          t5, t2, t1      // t5 = dataptr[5]
2713    sub          t6, t7, t0      // t6 = dataptr[3]
2714    add          t3, t2, t3      // t3 = dataptr[1]
2715    add          t4, t7, t4      // t4 = dataptr[7]
2716    sh           t5, 80(v0)
2717    sh           t6, 48(v0)
2718    sh           t3, 16(v0)
2719    sh           t4, 112(v0)
2720    addiu        v0, 2
2721    bne          v0, v1, 1b
2722     nop
2723
2724    RESTORE_REGS_FROM_STACK 8, s0, s1
2725
2726    j            ra
2727     nop
2728END(jsimd_fdct_ifast_mips_dspr2)
2729
2730/*****************************************************************************/
2731LEAF_MIPS_DSPR2(jsimd_quantize_mips_dspr2)
2732/*
2733 * a0     - coef_block
2734 * a1     - divisors
2735 * a2     - workspace
2736 */
2737
2738    .set at
2739
2740    SAVE_REGS_ON_STACK 16, s0, s1, s2
2741
2742    addiu   v0, a2, 124  // v0 = workspace_end
2743    lh      t0, 0(a2)
2744    lh      t1, 0(a1)
2745    lh      t2, 128(a1)
2746    sra     t3, t0, 15
2747    sll     t3, t3, 1
2748    addiu   t3, t3, 1
2749    mul     t0, t0, t3
2750    lh      t4, 384(a1)
2751    lh      t5, 130(a1)
2752    lh      t6, 2(a2)
2753    lh      t7, 2(a1)
2754    lh      t8, 386(a1)
2755
27561:
2757    andi    t1, 0xffff
2758    add     t9, t0, t2
2759    andi    t9, 0xffff
2760    mul     v1, t9, t1
2761    sra     s0, t6, 15
2762    sll     s0, s0, 1
2763    addiu   s0, s0, 1
2764    addiu   t9, t4, 16
2765    srav    v1, v1, t9
2766    mul     v1, v1, t3
2767    mul     t6, t6, s0
2768    andi    t7, 0xffff
2769    addiu   a2, a2, 4
2770    addiu   a1, a1, 4
2771    add     s1, t6, t5
2772    andi    s1, 0xffff
2773    sh      v1, 0(a0)
2774
2775    mul     s2, s1, t7
2776    addiu   s1, t8, 16
2777    srav    s2, s2, s1
2778    mul     s2,s2, s0
2779    lh      t0, 0(a2)
2780    lh      t1, 0(a1)
2781    sra     t3, t0, 15
2782    sll     t3, t3, 1
2783    addiu   t3, t3, 1
2784    mul     t0, t0, t3
2785    lh      t2, 128(a1)
2786    lh      t4, 384(a1)
2787    lh      t5, 130(a1)
2788    lh      t8, 386(a1)
2789    lh      t6, 2(a2)
2790    lh      t7, 2(a1)
2791    sh      s2, 2(a0)
2792    lh      t0, 0(a2)
2793    sra     t3, t0, 15
2794    sll     t3, t3, 1
2795    addiu   t3, t3, 1
2796    mul     t0, t0,t3
2797    bne     a2, v0, 1b
2798     addiu  a0, a0, 4
2799
2800    andi    t1, 0xffff
2801    add     t9, t0, t2
2802    andi    t9, 0xffff
2803    mul     v1, t9, t1
2804    sra     s0, t6, 15
2805    sll     s0, s0, 1
2806    addiu   s0, s0, 1
2807    addiu   t9, t4, 16
2808    srav    v1, v1, t9
2809    mul     v1, v1, t3
2810    mul     t6, t6, s0
2811    andi    t7, 0xffff
2812    sh      v1, 0(a0)
2813    add     s1, t6, t5
2814    andi    s1, 0xffff
2815    mul     s2, s1, t7
2816    addiu   s1, t8, 16
2817    addiu   a2, a2, 4
2818    addiu   a1, a1, 4
2819    srav    s2, s2, s1
2820    mul     s2, s2, s0
2821    sh      s2, 2(a0)
2822
2823    RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2824
2825    j       ra
2826     nop
2827
2828END(jsimd_quantize_mips_dspr2)
2829
2830/*****************************************************************************/
2831LEAF_MIPS_DSPR2(jsimd_quantize_float_mips_dspr2)
2832/*
2833 * a0     - coef_block
2834 * a1     - divisors
2835 * a2     - workspace
2836 */
2837
2838    .set at
2839
2840    li         t1, 0x46800100     //integer representation 16384.5
2841    mtc1       t1, f0
2842    li         t0, 63
28430:
2844    lwc1       f1, 0(a2)
2845    lwc1       f5, 0(a1)
2846    lwc1       f2, 4(a2)
2847    lwc1       f6, 4(a1)
2848    lwc1       f3, 8(a2)
2849    lwc1       f7, 8(a1)
2850    lwc1       f4, 12(a2)
2851    lwc1       f8, 12(a1)
2852    madd.s     f1, f0, f1, f5
2853    madd.s     f2, f0, f2, f6
2854    madd.s     f3, f0, f3, f7
2855    madd.s     f4, f0, f4, f8
2856    lwc1       f5, 16(a1)
2857    lwc1       f6, 20(a1)
2858    trunc.w.s  f1, f1
2859    trunc.w.s  f2, f2
2860    trunc.w.s  f3, f3
2861    trunc.w.s  f4, f4
2862    lwc1       f7, 24(a1)
2863    lwc1       f8, 28(a1)
2864    mfc1       t1, f1
2865    mfc1       t2, f2
2866    mfc1       t3, f3
2867    mfc1       t4, f4
2868    lwc1       f1, 16(a2)
2869    lwc1       f2, 20(a2)
2870    lwc1       f3, 24(a2)
2871    lwc1       f4, 28(a2)
2872    madd.s     f1, f0, f1, f5
2873    madd.s     f2, f0, f2, f6
2874    madd.s     f3, f0, f3, f7
2875    madd.s     f4, f0, f4, f8
2876    addiu      t1, t1, -16384
2877    addiu      t2, t2, -16384
2878    addiu      t3, t3, -16384
2879    addiu      t4, t4, -16384
2880    trunc.w.s  f1, f1
2881    trunc.w.s  f2, f2
2882    trunc.w.s  f3, f3
2883    trunc.w.s  f4, f4
2884    sh         t1, 0(a0)
2885    sh         t2, 2(a0)
2886    sh         t3, 4(a0)
2887    sh         t4, 6(a0)
2888    mfc1       t1, f1
2889    mfc1       t2, f2
2890    mfc1       t3, f3
2891    mfc1       t4, f4
2892    addiu      t0, t0, -8
2893    addiu      a2, a2, 32
2894    addiu      a1, a1, 32
2895    addiu      t1, t1, -16384
2896    addiu      t2, t2, -16384
2897    addiu      t3, t3, -16384
2898    addiu      t4, t4, -16384
2899    sh         t1, 8(a0)
2900    sh         t2, 10(a0)
2901    sh         t3, 12(a0)
2902    sh         t4, 14(a0)
2903    bgez       t0, 0b
2904     addiu     a0, a0, 16
2905
2906    j          ra
2907     nop
2908
2909END(jsimd_quantize_float_mips_dspr2)
2910/*****************************************************************************/
2911LEAF_MIPS_DSPR2(jsimd_idct_2x2_mips_dspr2)
2912/*
2913 * a0     - compptr->dct_table
2914 * a1     - coef_block
2915 * a2     - output_buf
2916 * a3     - output_col
2917 */
2918    .set at
2919
2920    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2921
2922    addiu     sp, sp, -40
2923    move      v0, sp
2924    addiu     s2, zero, 29692
2925    addiu     s3, zero, -10426
2926    addiu     s4, zero, 6967
2927    addiu     s5, zero, -5906
2928    lh        t0, 0(a1)         // t0 = inptr[DCTSIZE*0]
2929    lh        t5, 0(a0)         // t5 = quantptr[DCTSIZE*0]
2930    lh        t1, 48(a1)        // t1 = inptr[DCTSIZE*3]
2931    lh        t6, 48(a0)        // t6 = quantptr[DCTSIZE*3]
2932    mul       t4, t5, t0
2933    lh        t0, 16(a1)        // t0 = inptr[DCTSIZE*1]
2934    lh        t5, 16(a0)        // t5 = quantptr[DCTSIZE*1]
2935    mul       t6, t6, t1
2936    mul       t5, t5, t0
2937    lh        t2, 80(a1)        // t2 = inptr[DCTSIZE*5]
2938    lh        t7, 80(a0)        // t7 = quantptr[DCTSIZE*5]
2939    lh        t3, 112(a1)       // t3 = inptr[DCTSIZE*7]
2940    lh        t8, 112(a0)       // t8 = quantptr[DCTSIZE*7]
2941    mul       t7, t7, t2
2942    mult      zero, zero
2943    mul       t8, t8, t3
2944    li        s0, 0x73FCD746    // s0 = (29692 << 16) | (-10426 & 0xffff)
2945    li        s1, 0x1B37E8EE    // s1 = (6967 << 16) | (-5906 & 0xffff)
2946    ins       t6, t5, 16, 16    // t6 = t5|t6
2947    sll       t4, t4, 15
2948    dpa.w.ph  $ac0, t6, s0
2949    lh        t1, 2(a1)
2950    lh        t6, 2(a0)
2951    ins       t8, t7, 16, 16    // t8 = t7|t8
2952    dpa.w.ph  $ac0, t8, s1
2953    mflo      t0, $ac0
2954    mul       t5, t6, t1
2955    lh        t1, 18(a1)
2956    lh        t6, 18(a0)
2957    lh        t2, 50(a1)
2958    lh        t7, 50(a0)
2959    mul       t6, t6, t1
2960    subu      t8, t4, t0
2961    mul       t7, t7, t2
2962    addu      t0, t4, t0
2963    shra_r.w  t0, t0, 13
2964    lh        t1, 82(a1)
2965    lh        t2, 82(a0)
2966    lh        t3, 114(a1)
2967    lh        t4, 114(a0)
2968    shra_r.w  t8, t8, 13
2969    mul       t1, t1, t2
2970    mul       t3, t3, t4
2971    sw        t0, 0(v0)
2972    sw        t8, 20(v0)
2973    sll       t4, t5, 15
2974    ins       t7, t6, 16, 16
2975    mult      zero, zero
2976    dpa.w.ph  $ac0, t7, s0
2977    ins       t3, t1, 16, 16
2978    lh        t1, 6(a1)
2979    lh        t6, 6(a0)
2980    dpa.w.ph  $ac0, t3, s1
2981    mflo      t0, $ac0
2982    mul       t5, t6, t1
2983    lh        t1, 22(a1)
2984    lh        t6, 22(a0)
2985    lh        t2, 54(a1)
2986    lh        t7, 54(a0)
2987    mul       t6, t6, t1
2988    subu      t8, t4, t0
2989    mul       t7, t7, t2
2990    addu      t0, t4, t0
2991    shra_r.w  t0, t0, 13
2992    lh        t1, 86(a1)
2993    lh        t2, 86(a0)
2994    lh        t3, 118(a1)
2995    lh        t4, 118(a0)
2996    shra_r.w  t8, t8, 13
2997    mul       t1, t1, t2
2998    mul       t3, t3, t4
2999    sw        t0, 4(v0)
3000    sw        t8, 24(v0)
3001    sll       t4, t5, 15
3002    ins       t7, t6, 16, 16
3003    mult      zero, zero
3004    dpa.w.ph  $ac0, t7, s0
3005    ins       t3, t1, 16, 16
3006    lh        t1, 10(a1)
3007    lh        t6, 10(a0)
3008    dpa.w.ph  $ac0, t3, s1
3009    mflo      t0, $ac0
3010    mul       t5, t6, t1
3011    lh        t1, 26(a1)
3012    lh        t6, 26(a0)
3013    lh        t2, 58(a1)
3014    lh        t7, 58(a0)
3015    mul       t6, t6, t1
3016    subu      t8, t4, t0
3017    mul       t7, t7, t2
3018    addu      t0, t4, t0
3019    shra_r.w  t0, t0, 13
3020    lh        t1, 90(a1)
3021    lh        t2, 90(a0)
3022    lh        t3, 122(a1)
3023    lh        t4, 122(a0)
3024    shra_r.w  t8, t8, 13
3025    mul       t1, t1, t2
3026    mul       t3, t3, t4
3027    sw        t0, 8(v0)
3028    sw        t8, 28(v0)
3029    sll       t4, t5, 15
3030    ins       t7, t6, 16, 16
3031    mult      zero, zero
3032    dpa.w.ph  $ac0, t7, s0
3033    ins       t3, t1, 16, 16
3034    lh        t1, 14(a1)
3035    lh        t6, 14(a0)
3036    dpa.w.ph  $ac0, t3, s1
3037    mflo      t0, $ac0
3038    mul       t5, t6, t1
3039    lh        t1, 30(a1)
3040    lh        t6, 30(a0)
3041    lh        t2, 62(a1)
3042    lh        t7, 62(a0)
3043    mul       t6, t6, t1
3044    subu      t8, t4, t0
3045    mul       t7, t7, t2
3046    addu      t0, t4, t0
3047    shra_r.w  t0, t0, 13
3048    lh        t1, 94(a1)
3049    lh        t2, 94(a0)
3050    lh        t3, 126(a1)
3051    lh        t4, 126(a0)
3052    shra_r.w  t8, t8, 13
3053    mul       t1, t1, t2
3054    mul       t3, t3, t4
3055    sw        t0, 12(v0)
3056    sw        t8, 32(v0)
3057    sll       t4, t5, 15
3058    ins       t7, t6, 16, 16
3059    mult      zero, zero
3060    dpa.w.ph  $ac0, t7, s0
3061    ins       t3, t1, 16, 16
3062    dpa.w.ph  $ac0, t3, s1
3063    mflo      t0, $ac0
3064    lw        t9, 0(a2)
3065    lw        t3, 0(v0)
3066    lw        t7, 4(v0)
3067    lw        t1, 8(v0)
3068    addu      t9, t9, a3
3069    sll       t3, t3, 15
3070    subu      t8, t4, t0
3071    addu      t0, t4, t0
3072    shra_r.w  t0, t0, 13
3073    shra_r.w  t8, t8, 13
3074    sw        t0, 16(v0)
3075    sw        t8, 36(v0)
3076    lw        t5, 12(v0)
3077    lw        t6, 16(v0)
3078    mult      t7, s2
3079    madd      t1, s3
3080    madd      t5, s4
3081    madd      t6, s5
3082    lw        t5, 24(v0)
3083    lw        t7, 28(v0)
3084    mflo      t0, $ac0
3085    lw        t8, 32(v0)
3086    lw        t2, 36(v0)
3087    mult      $ac1, t5, s2
3088    madd      $ac1, t7, s3
3089    madd      $ac1, t8, s4
3090    madd      $ac1, t2, s5
3091    addu      t1, t3, t0
3092    subu      t6, t3, t0
3093    shra_r.w  t1, t1, 20
3094    shra_r.w  t6, t6, 20
3095    mflo      t4, $ac1
3096    shll_s.w  t1, t1, 24
3097    shll_s.w  t6, t6, 24
3098    sra       t1, t1, 24
3099    sra       t6, t6, 24
3100    addiu     t1, t1, 128
3101    addiu     t6, t6, 128
3102    lw        t0, 20(v0)
3103    sb        t1, 0(t9)
3104    sb        t6, 1(t9)
3105    sll       t0, t0, 15
3106    lw        t9, 4(a2)
3107    addu      t1, t0, t4
3108    subu      t6, t0, t4
3109    addu      t9, t9, a3
3110    shra_r.w  t1, t1, 20
3111    shra_r.w  t6, t6, 20
3112    shll_s.w  t1, t1, 24
3113    shll_s.w  t6, t6, 24
3114    sra       t1, t1, 24
3115    sra       t6, t6, 24
3116    addiu     t1, t1, 128
3117    addiu     t6, t6, 128
3118    sb        t1, 0(t9)
3119    sb        t6, 1(t9)
3120    addiu     sp, sp, 40
3121
3122    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3123
3124    j         ra
3125     nop
3126
3127END(jsimd_idct_2x2_mips_dspr2)
3128
3129/*****************************************************************************/
3130LEAF_MIPS_DSPR2(jsimd_idct_4x4_mips_dspr2)
3131/*
3132 * a0     - compptr->dct_table
3133 * a1     - coef_block
3134 * a2     - output_buf
3135 * a3     - output_col
3136 * 16(sp) - workspace[DCTSIZE*4];  // buffers data between passes
3137 */
3138
3139    .set at
3140    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3141
3142    lw        v1, 48(sp)
3143    move      t0, a1
3144    move      t1, v1
3145    li        t9, 4
3146    li        s0, 0x2e75f93e
3147    li        s1, 0x21f9ba79
3148    li        s2, 0xecc2efb0
3149    li        s3, 0x52031ccd
3150
31510:
3152    lh        s6, 32(t0)        // inptr[DCTSIZE*2]
3153    lh        t6, 32(a0)        // quantptr[DCTSIZE*2]
3154    lh        s7, 96(t0)        // inptr[DCTSIZE*6]
3155    lh        t7, 96(a0)        // quantptr[DCTSIZE*6]
3156    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3157    lh        s4, 0(t0)         // inptr[DCTSIZE*0]
3158    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3159    lh        s5, 0(a0)         // quantptr[0]
3160    li        s6, 15137
3161    li        s7, 6270
3162    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
3163    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3164    lh        t5, 112(t0)       // inptr[DCTSIZE*7]
3165    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3166    lh        s4, 112(a0)       // quantptr[DCTSIZE*7]
3167    lh        v0, 80(t0)        // inptr[DCTSIZE*5]
3168    lh        s5, 80(a0)        // quantptr[DCTSIZE*5]
3169    lh        s6, 48(a0)        // quantptr[DCTSIZE*3]
3170    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
3171    lh        s7, 16(a0)        // quantptr[DCTSIZE*1]
3172    lh        t8, 16(t0)        // inptr[DCTSIZE*1]
3173    subu      t6, t6, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3174    lh        t7, 48(t0)        // inptr[DCTSIZE*3]
3175    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3176    mul       v0, s5, v0        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3177    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3178    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3179    addu      t3, t2, t6        // tmp10 = tmp0 + z2
3180    subu      t4, t2, t6        // tmp10 = tmp0 - z2
3181    mult      $ac0, zero, zero
3182    mult      $ac1, zero, zero
3183    ins       t5, v0, 16, 16
3184    ins       t7, t8, 16, 16
3185    addiu     t9, t9, -1
3186    dpa.w.ph  $ac0, t5, s0
3187    dpa.w.ph  $ac0, t7, s1
3188    dpa.w.ph  $ac1, t5, s2
3189    dpa.w.ph  $ac1, t7, s3
3190    mflo      s4, $ac0
3191    mflo      s5, $ac1
3192    addiu     a0, a0, 2
3193    addiu     t1, t1, 4
3194    addiu     t0, t0, 2
3195    addu      t6, t4, s4
3196    subu      t5, t4, s4
3197    addu      s6, t3, s5
3198    subu      s7, t3, s5
3199    shra_r.w  t6, t6, 12        // DESCALE(tmp12 + temp1, 12)
3200    shra_r.w  t5, t5, 12        // DESCALE(tmp12 - temp1, 12)
3201    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
3202    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
3203    sw        t6, 28(t1)
3204    sw        t5, 60(t1)
3205    sw        s6, -4(t1)
3206    bgtz      t9, 0b
3207     sw       s7, 92(t1)
3208    // second loop three pass
3209    li        t9, 3
32101:
3211    lh        s6, 34(t0)        // inptr[DCTSIZE*2]
3212    lh        t6, 34(a0)        // quantptr[DCTSIZE*2]
3213    lh        s7, 98(t0)        // inptr[DCTSIZE*6]
3214    lh        t7, 98(a0)        // quantptr[DCTSIZE*6]
3215    mul       t6, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3216    lh        s4, 2(t0)         // inptr[DCTSIZE*0]
3217    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3218    lh        s5, 2(a0)         // quantptr[DCTSIZE*0]
3219    li        s6, 15137
3220    li        s7, 6270
3221    mul       t2, s4, s5        // tmp0 = (inptr[0] * quantptr[0])
3222    mul       v0, s6, t6        // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3223    lh        t5, 114(t0)       // inptr[DCTSIZE*7]
3224    mul       t7, s7, t7        // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3225    lh        s4, 114(a0)       // quantptr[DCTSIZE*7]
3226    lh        s5, 82(a0)        // quantptr[DCTSIZE*5]
3227    lh        t6, 82(t0)        // inptr[DCTSIZE*5]
3228    sll       t2, t2, 14        // tmp0 <<= (CONST_BITS+1)
3229    lh        s6, 50(a0)        // quantptr[DCTSIZE*3]
3230    lh        t8, 18(t0)        // inptr[DCTSIZE*1]
3231    subu      v0, v0, t7        // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3232    lh        t7, 50(t0)        // inptr[DCTSIZE*3]
3233    lh        s7, 18(a0)        // quantptr[DCTSIZE*1]
3234    mul       t5, s4, t5        // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3235    mul       t6, s5, t6        // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3236    mul       t7, s6, t7        // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3237    mul       t8, s7, t8        // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3238    addu      t3, t2, v0        // tmp10 = tmp0 + z2
3239    subu      t4, t2, v0        // tmp10 = tmp0 - z2
3240    mult      $ac0, zero, zero
3241    mult      $ac1, zero, zero
3242    ins       t5, t6, 16, 16
3243    ins       t7, t8, 16, 16
3244    dpa.w.ph  $ac0, t5, s0
3245    dpa.w.ph  $ac0, t7, s1
3246    dpa.w.ph  $ac1, t5, s2
3247    dpa.w.ph  $ac1, t7, s3
3248    mflo      t5, $ac0
3249    mflo      t6, $ac1
3250    addiu     t9, t9, -1
3251    addiu     t0, t0, 2
3252    addiu     a0, a0, 2
3253    addiu     t1, t1, 4
3254    addu      s5, t4, t5
3255    subu      s4, t4, t5
3256    addu      s6, t3, t6
3257    subu      s7, t3, t6
3258    shra_r.w  s5, s5, 12        // DESCALE(tmp12 + temp1, 12)
3259    shra_r.w  s4, s4, 12        // DESCALE(tmp12 - temp1, 12)
3260    shra_r.w  s6, s6, 12        // DESCALE(tmp10 + temp2, 12)
3261    shra_r.w  s7, s7, 12        // DESCALE(tmp10 - temp2, 12)
3262    sw        s5, 32(t1)
3263    sw        s4, 64(t1)
3264    sw        s6, 0(t1)
3265    bgtz      t9, 1b
3266     sw       s7, 96(t1)
3267    move      t1, v1
3268    li        s4, 15137
3269    lw        s6, 8(t1)         // wsptr[2]
3270    li        s5, 6270
3271    lw        s7, 24(t1)        // wsptr[6]
3272    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3273    lw        t2, 0(t1)         // wsptr[0]
3274    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
3275    lh        t5, 28(t1)        // wsptr[7]
3276    lh        t6, 20(t1)        // wsptr[5]
3277    lh        t7, 12(t1)        // wsptr[3]
3278    lh        t8, 4(t1)         // wsptr[1]
3279    ins       t5, t6, 16, 16
3280    ins       t7, t8, 16, 16
3281    mult      $ac0, zero, zero
3282    dpa.w.ph  $ac0, t5, s0
3283    dpa.w.ph  $ac0, t7, s1
3284    mult      $ac1, zero, zero
3285    dpa.w.ph  $ac1, t5, s2
3286    dpa.w.ph  $ac1, t7, s3
3287    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3288    mflo      s6, $ac0
3289    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3290    subu      s4, s4, s5
3291    addu      t3, t2, s4        // tmp10 = tmp0 + z2
3292    mflo      s7, $ac1
3293    subu      t4, t2, s4        // tmp10 = tmp0 - z2
3294    addu      t7, t4, s6
3295    subu      t8, t4, s6
3296    addu      t5, t3, s7
3297    subu      t6, t3, s7
3298    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
3299    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
3300    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
3301    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
3302    sll       s4, t9, 2
3303    lw        v0, 0(a2)         // output_buf[ctr]
3304    shll_s.w  t5, t5, 24
3305    shll_s.w  t6, t6, 24
3306    shll_s.w  t7, t7, 24
3307    shll_s.w  t8, t8, 24
3308    sra       t5, t5, 24
3309    sra       t6, t6, 24
3310    sra       t7, t7, 24
3311    sra       t8, t8, 24
3312    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
3313    addiu     t5, t5, 128
3314    addiu     t6, t6, 128
3315    addiu     t7, t7, 128
3316    addiu     t8, t8, 128
3317    sb        t5, 0(v0)
3318    sb        t7, 1(v0)
3319    sb        t8, 2(v0)
3320    sb        t6, 3(v0)
3321    // 2
3322    li        s4, 15137
3323    lw        s6, 40(t1)        // wsptr[2]
3324    li        s5, 6270
3325    lw        s7, 56(t1)        // wsptr[6]
3326    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3327    lw        t2, 32(t1)        // wsptr[0]
3328    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
3329    lh        t5, 60(t1)        // wsptr[7]
3330    lh        t6, 52(t1)        // wsptr[5]
3331    lh        t7, 44(t1)        // wsptr[3]
3332    lh        t8, 36(t1)        // wsptr[1]
3333    ins       t5, t6, 16, 16
3334    ins       t7, t8, 16, 16
3335    mult      $ac0, zero, zero
3336    dpa.w.ph  $ac0, t5, s0
3337    dpa.w.ph  $ac0, t7, s1
3338    mult      $ac1, zero, zero
3339    dpa.w.ph  $ac1, t5, s2
3340    dpa.w.ph  $ac1, t7, s3
3341    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3342    mflo      s6, $ac0
3343    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3344    subu      s4, s4, s5
3345    addu      t3, t2, s4        // tmp10 = tmp0 + z2
3346    mflo      s7, $ac1
3347    subu      t4, t2, s4        // tmp10 = tmp0 - z2
3348    addu      t7, t4, s6
3349    subu      t8, t4, s6
3350    addu      t5, t3, s7
3351    subu      t6, t3, s7
3352    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
3353    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
3354    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
3355    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
3356    sll       s4, t9, 2
3357    lw        v0, 4(a2)         // output_buf[ctr]
3358    shll_s.w  t5, t5, 24
3359    shll_s.w  t6, t6, 24
3360    shll_s.w  t7, t7, 24
3361    shll_s.w  t8, t8, 24
3362    sra       t5, t5, 24
3363    sra       t6, t6, 24
3364    sra       t7, t7, 24
3365    sra       t8, t8, 24
3366    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
3367    addiu     t5, t5, 128
3368    addiu     t6, t6, 128
3369    addiu     t7, t7, 128
3370    addiu     t8, t8, 128
3371    sb        t5, 0(v0)
3372    sb        t7, 1(v0)
3373    sb        t8, 2(v0)
3374    sb        t6, 3(v0)
3375    // 3
3376    li        s4, 15137
3377    lw        s6, 72(t1)        // wsptr[2]
3378    li        s5, 6270
3379    lw        s7, 88(t1)        // wsptr[6]
3380    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3381    lw        t2, 64(t1)        // wsptr[0]
3382    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], - FIX_0_765366865)
3383    lh        t5, 92(t1)        // wsptr[7]
3384    lh        t6, 84(t1)        // wsptr[5]
3385    lh        t7, 76(t1)        // wsptr[3]
3386    lh        t8, 68(t1)        // wsptr[1]
3387    ins       t5, t6, 16, 16
3388    ins       t7, t8, 16, 16
3389    mult      $ac0, zero, zero
3390    dpa.w.ph  $ac0, t5, s0
3391    dpa.w.ph  $ac0, t7, s1
3392    mult      $ac1, zero, zero
3393    dpa.w.ph  $ac1, t5, s2
3394    dpa.w.ph  $ac1, t7, s3
3395    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3396    mflo      s6, $ac0
3397    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3398    subu      s4, s4, s5
3399    addu      t3, t2, s4        // tmp10 = tmp0 + z2
3400    mflo      s7, $ac1
3401    subu      t4, t2, s4        // tmp10 = tmp0 - z2
3402    addu      t7, t4, s6
3403    subu      t8, t4, s6
3404    addu      t5, t3, s7
3405    subu      t6, t3, s7
3406    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
3407    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
3408    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
3409    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
3410    sll       s4, t9, 2
3411    lw        v0, 8(a2)         // output_buf[ctr]
3412    shll_s.w  t5, t5, 24
3413    shll_s.w  t6, t6, 24
3414    shll_s.w  t7, t7, 24
3415    shll_s.w  t8, t8, 24
3416    sra       t5, t5, 24
3417    sra       t6, t6, 24
3418    sra       t7, t7, 24
3419    sra       t8, t8, 24
3420    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
3421    addiu     t5, t5, 128
3422    addiu     t6, t6, 128
3423    addiu     t7, t7, 128
3424    addiu     t8, t8, 128
3425    sb        t5, 0(v0)
3426    sb        t7, 1(v0)
3427    sb        t8, 2(v0)
3428    sb        t6, 3(v0)
3429    li        s4, 15137
3430    lw        s6, 104(t1)       // wsptr[2]
3431    li        s5, 6270
3432    lw        s7, 120(t1)       // wsptr[6]
3433    mul       s4, s4, s6        // MULTIPLY((INT32) wsptr[2], FIX_1_847759065)
3434    lw        t2, 96(t1)        // wsptr[0]
3435    mul       s5, s5, s7        // MULTIPLY((INT32) wsptr[6], -FIX_0_765366865)
3436    lh        t5, 124(t1)       // wsptr[7]
3437    lh        t6, 116(t1)       // wsptr[5]
3438    lh        t7, 108(t1)       // wsptr[3]
3439    lh        t8, 100(t1)       // wsptr[1]
3440    ins       t5, t6, 16, 16
3441    ins       t7, t8, 16, 16
3442    mult      $ac0, zero, zero
3443    dpa.w.ph  $ac0, t5, s0
3444    dpa.w.ph  $ac0, t7, s1
3445    mult      $ac1, zero, zero
3446    dpa.w.ph  $ac1, t5, s2
3447    dpa.w.ph  $ac1, t7, s3
3448    sll       t2, t2, 14        // tmp0 = ((INT32) wsptr[0]) << (CONST_BITS+1)
3449    mflo      s6, $ac0
3450    // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3451    subu      s4, s4, s5
3452    addu      t3, t2, s4        // tmp10 = tmp0 + z2;
3453    mflo      s7, $ac1
3454    subu      t4, t2, s4        // tmp10 = tmp0 - z2;
3455    addu      t7, t4, s6
3456    subu      t8, t4, s6
3457    addu      t5, t3, s7
3458    subu      t6, t3, s7
3459    shra_r.w  t5, t5, 19        // DESCALE(tmp10 + temp2, 19)
3460    shra_r.w  t6, t6, 19        // DESCALE(tmp10 - temp2, 19)
3461    shra_r.w  t7, t7, 19        // DESCALE(tmp12 + temp1, 19)
3462    shra_r.w  t8, t8, 19        // DESCALE(tmp12 - temp1, 19)
3463    sll       s4, t9, 2
3464    lw        v0, 12(a2)        // output_buf[ctr]
3465    shll_s.w  t5, t5, 24
3466    shll_s.w  t6, t6, 24
3467    shll_s.w  t7, t7, 24
3468    shll_s.w  t8, t8, 24
3469    sra       t5, t5, 24
3470    sra       t6, t6, 24
3471    sra       t7, t7, 24
3472    sra       t8, t8, 24
3473    addu      v0, v0, a3        // outptr = output_buf[ctr] + output_col
3474    addiu     t5, t5, 128
3475    addiu     t6, t6, 128
3476    addiu     t7, t7, 128
3477    addiu     t8, t8, 128
3478    sb        t5, 0(v0)
3479    sb        t7, 1(v0)
3480    sb        t8, 2(v0)
3481    sb        t6, 3(v0)
3482
3483    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3484
3485    j         ra
3486     nop
3487END(jsimd_idct_4x4_mips_dspr2)
3488
3489/*****************************************************************************/
3490LEAF_MIPS_DSPR2(jsimd_idct_6x6_mips_dspr2)
3491/*
3492 * a0     - compptr->dct_table
3493 * a1     - coef_block
3494 * a2     - output_buf
3495 * a3     - output_col
3496 */
3497    .set at
3498
3499    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3500
3501    addiu     sp, sp, -144
3502    move      v0, sp
3503    addiu     v1, v0, 24
3504    addiu     t9, zero, 5793
3505    addiu     s0, zero, 10033
3506    addiu     s1, zero, 2998
3507
35081:
3509    lh        s2, 0(a0)   // q0 = quantptr[ 0]
3510    lh        s3, 32(a0)  // q1 = quantptr[16]
3511    lh        s4, 64(a0)  // q2 = quantptr[32]
3512    lh        t2, 64(a1)  // tmp2 = inptr[32]
3513    lh        t1, 32(a1)  // tmp1 = inptr[16]
3514    lh        t0, 0(a1)   // tmp0 = inptr[ 0]
3515    mul       t2, t2, s4  // tmp2 = tmp2 * q2
3516    mul       t1, t1, s3  // tmp1 = tmp1 * q1
3517    mul       t0, t0, s2  // tmp0 = tmp0 * q0
3518    lh        t6, 16(a1)  // z1 = inptr[ 8]
3519    lh        t8, 80(a1)  // z3 = inptr[40]
3520    lh        t7, 48(a1)  // z2 = inptr[24]
3521    lh        s2, 16(a0)  // q0 = quantptr[ 8]
3522    lh        s4, 80(a0)  // q2 = quantptr[40]
3523    lh        s3, 48(a0)  // q1 = quantptr[24]
3524    mul       t2, t2, t9  // tmp2 = tmp2 * 5793
3525    mul       t1, t1, s0  // tmp1 = tmp1 * 10033
3526    sll       t0, t0, 13  // tmp0 = tmp0 << 13
3527    mul       t6, t6, s2  // z1 = z1 * q0
3528    mul       t8, t8, s4  // z3 = z3 * q2
3529    mul       t7, t7, s3  // z2 = z2 * q1
3530    addu      t3, t0, t2  // tmp10 = tmp0 + tmp2
3531    sll       t2, t2, 1   // tmp2 = tmp2 << 2
3532    subu      t4, t0, t2  // tmp11 = tmp0 - tmp2;
3533    subu      t5, t3, t1  // tmp12 = tmp10 - tmp1
3534    addu      t3, t3, t1  // tmp10 = tmp10 + tmp1
3535    addu      t1, t6, t8  // tmp1 = z1 + z3
3536    mul       t1, t1, s1  // tmp1 = tmp1 * 2998
3537    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
3538    subu      t2, t6, t8  // tmp2 = z1 - z3
3539    subu      t2, t2, t7  // tmp2 = tmp2 - z2
3540    sll       t2, t2, 2   // tmp2 = tmp2 << 2
3541    addu      t0, t6, t7  // tmp0 = z1 + z2
3542    sll       t0, t0, 13  // tmp0 = tmp0 << 13
3543    subu      s2, t8, t7  // q0 = z3 - z2
3544    sll       s2, s2, 13  // q0 = q0 << 13
3545    addu      t0, t0, t1  // tmp0 = tmp0 + tmp1
3546    addu      t1, s2, t1  // tmp1 = q0 + tmp1
3547    addu      s2, t4, t2  // q0 = tmp11 + tmp2
3548    subu      s3, t4, t2  // q1 = tmp11 - tmp2
3549    addu      t6, t3, t0  // z1 = tmp10 + tmp0
3550    subu      t7, t3, t0  // z2 = tmp10 - tmp0
3551    addu      t4, t5, t1  // tmp11 = tmp12 + tmp1
3552    subu      t5, t5, t1  // tmp12 = tmp12 - tmp1
3553    shra_r.w  t6, t6, 11  // z1 = (z1 + 1024) >> 11
3554    shra_r.w  t7, t7, 11  // z2 = (z2 + 1024) >> 11
3555    shra_r.w  t4, t4, 11  // tmp11 = (tmp11 + 1024) >> 11
3556    shra_r.w  t5, t5, 11  // tmp12 = (tmp12 + 1024) >> 11
3557    sw        s2, 24(v0)
3558    sw        s3, 96(v0)
3559    sw        t6, 0(v0)
3560    sw        t7, 120(v0)
3561    sw        t4, 48(v0)
3562    sw        t5, 72(v0)
3563    addiu     v0, v0, 4
3564    addiu     a1, a1, 2
3565    bne       v0, v1, 1b
3566     addiu    a0, a0, 2
3567
3568    /* Pass 2: process 6 rows from work array, store into output array. */
3569    move      v0, sp
3570    addiu     v1, v0, 144
3571
35722:
3573    lw        t0, 0(v0)
3574    lw        t2, 16(v0)
3575    lw        s5, 0(a2)
3576    addiu     t0, t0, 16
3577    sll       t0, t0, 13
3578    mul       t3, t2, t9
3579    lw        t6, 4(v0)
3580    lw        t8, 20(v0)
3581    lw        t7, 12(v0)
3582    addu      s5, s5, a3
3583    addu      s6, t6, t8
3584    mul       s6, s6, s1
3585    addu      t1, t0, t3
3586    subu      t4, t0, t3
3587    subu      t4, t4, t3
3588    lw        t3, 8(v0)
3589    mul       t0, t3, s0
3590    addu      s7, t6, t7
3591    sll       s7, s7, 13
3592    addu      s7, s6, s7
3593    subu      t2, t8, t7
3594    sll       t2, t2, 13
3595    addu      t2, s6, t2
3596    subu      s6, t6, t7
3597    subu      s6, s6, t8
3598    sll       s6, s6, 13
3599    addu      t3, t1, t0
3600    subu      t5, t1, t0
3601    addu      t6, t3, s7
3602    subu      t3, t3, s7
3603    addu      t7, t4, s6
3604    subu      t4, t4, s6
3605    addu      t8, t5, t2
3606    subu      t5, t5, t2
3607    shll_s.w  t6, t6, 6
3608    shll_s.w  t3, t3, 6
3609    shll_s.w  t7, t7, 6
3610    shll_s.w  t4, t4, 6
3611    shll_s.w  t8, t8, 6
3612    shll_s.w  t5, t5, 6
3613    sra       t6, t6, 24
3614    addiu     t6, t6, 128
3615    sra       t3, t3, 24
3616    addiu     t3, t3, 128
3617    sb        t6, 0(s5)
3618    sra       t7, t7, 24
3619    addiu     t7, t7, 128
3620    sb        t3, 5(s5)
3621    sra       t4, t4, 24
3622    addiu     t4, t4, 128
3623    sb        t7, 1(s5)
3624    sra       t8, t8, 24
3625    addiu     t8, t8, 128
3626    sb        t4, 4(s5)
3627    addiu     v0, v0, 24
3628    sra       t5, t5, 24
3629    addiu     t5, t5, 128
3630    sb        t8, 2(s5)
3631    addiu     a2, a2,  4
3632    bne       v0, v1, 2b
3633     sb       t5, 3(s5)
3634
3635    addiu     sp, sp, 144
3636
3637    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3638
3639    j         ra
3640     nop
3641
3642END(jsimd_idct_6x6_mips_dspr2)
3643
3644/*****************************************************************************/
3645LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass1_mips_dspr2)
3646/*
3647 * a0     - compptr->dct_table
3648 * a1     - coef_block
3649 * a2     - workspace
3650 */
3651
3652    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3653
3654    li         a3, 8
3655
36561:
3657    // odd part
3658    lh         t0, 48(a1)
3659    lh         t1, 48(a0)
3660    lh         t2, 16(a1)
3661    lh         t3, 16(a0)
3662    lh         t4, 80(a1)
3663    lh         t5, 80(a0)
3664    lh         t6, 112(a1)
3665    lh         t7, 112(a0)
3666    mul        t0, t0, t1    // z2
3667    mul        t1, t2, t3    // z1
3668    mul        t2, t4, t5    // z3
3669    mul        t3, t6, t7    // z4
3670    li         t4, 10703     // FIX(1.306562965)
3671    li         t5, 4433      // FIX_0_541196100
3672    li         t6, 7053      // FIX(0.860918669)
3673    mul        t4, t0,t4     // tmp11
3674    mul        t5, t0,t5     // -tmp14
3675    addu       t7, t1,t2     // tmp10
3676    addu       t8, t7,t3     // tmp10 + z4
3677    mul        t6, t6, t8    // tmp15
3678    li         t8, 2139      // FIX(0.261052384)
3679    mul        t8, t7, t8    // MULTIPLY(tmp10, FIX(0.261052384))
3680    li         t7, 2295      // FIX(0.280143716)
3681    mul        t7, t1, t7    // MULTIPLY(z1, FIX(0.280143716))
3682    addu       t9, t2, t3    // z3 + z4
3683    li         s0, 8565      // FIX(1.045510580)
3684    mul        t9, t9, s0    // -tmp13
3685    li         s0, 12112     // FIX(1.478575242)
3686    mul        s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242)
3687    li         s1, 12998     // FIX(1.586706681)
3688    mul        s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
3689    li         s2, 5540      // FIX(0.676326758)
3690    mul        s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
3691    li         s3, 16244     // FIX(1.982889723)
3692    mul        s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
3693    subu       t1, t1, t3    // z1-=z4
3694    subu       t0, t0, t2    // z2-=z3
3695    addu       t2, t0, t1    // z1+z2
3696    li         t3, 4433      // FIX_0_541196100
3697    mul        t2, t2, t3    // z3
3698    li         t3, 6270      // FIX_0_765366865
3699    mul        t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
3700    li         t3, 15137     // FIX_0_765366865
3701    mul        t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
3702    addu       t8, t6, t8    // tmp12
3703    addu       t3, t8, t4    // tmp12 + tmp11
3704    addu       t3, t3, t7    // tmp10
3705    subu       t8, t8, t9    // tmp12 + tmp13
3706    addu       s0, t5, s0
3707    subu       t8, t8, s0    // tmp12
3708    subu       t9, t6, t9
3709    subu       s1, s1, t4
3710    addu       t9, t9, s1    // tmp13
3711    subu       t6, t6, t5
3712    subu       t6, t6, s2
3713    subu       t6, t6, s3    // tmp15
3714    // even part start
3715    lh         t4, 64(a1)
3716    lh         t5, 64(a0)
3717    lh         t7, 32(a1)
3718    lh         s0, 32(a0)
3719    lh         s1, 0(a1)
3720    lh         s2, 0(a0)
3721    lh         s3, 96(a1)
3722    lh         v0, 96(a0)
3723    mul        t4, t4, t5    // DEQUANTIZE(inptr[DCTSIZE*4],quantptr[DCTSIZE*4])
3724    mul        t5, t7, s0    // DEQUANTIZE(inptr[DCTSIZE*2],quantptr[DCTSIZE*2])
3725    mul        t7, s1, s2    // DEQUANTIZE(inptr[DCTSIZE*0],quantptr[DCTSIZE*0])
3726    mul        s0, s3, v0    // DEQUANTIZE(inptr[DCTSIZE*6],quantptr[DCTSIZE*6])
3727    // odd part end
3728    addu       t1, t2, t1    // tmp11
3729    subu       t0, t2, t0    // tmp14
3730    // update counter and pointers
3731    addiu      a3, a3, -1
3732    addiu      a0, a0, 2
3733    addiu      a1, a1, 2
3734    // even part rest
3735    li         s1, 10033
3736    li         s2, 11190
3737    mul        t4, t4, s1    // z4
3738    mul        s1, t5, s2    // z4
3739    sll        t5, t5, 13    // z1
3740    sll        t7, t7, 13
3741    addiu      t7, t7, 1024  // z3
3742    sll        s0, s0, 13    // z2
3743    addu       s2, t7, t4    // tmp10
3744    subu       t4, t7, t4    // tmp11
3745    subu       s3, t5, s0    // tmp12
3746    addu       t2, t7, s3    // tmp21
3747    subu       s3, t7, s3    // tmp24
3748    addu       t7, s1, s0    // tmp12
3749    addu       v0, s2, t7    // tmp20
3750    subu       s2, s2, t7    // tmp25
3751    subu       s1, s1, t5    // z4 - z1
3752    subu       s1, s1, s0    // tmp12
3753    addu       s0, t4, s1    // tmp22
3754    subu       t4, t4, s1    // tmp23
3755    // final output stage
3756    addu       t5, v0, t3
3757    subu       v0, v0, t3
3758    addu       t3, t2, t1
3759    subu       t2, t2, t1
3760    addu       t1, s0, t8
3761    subu       s0, s0, t8
3762    addu       t8, t4, t9
3763    subu       t4, t4, t9
3764    addu       t9, s3, t0
3765    subu       s3, s3, t0
3766    addu       t0, s2, t6
3767    subu       s2, s2, t6
3768    sra        t5, t5, 11
3769    sra        t3, t3, 11
3770    sra        t1, t1, 11
3771    sra        t8, t8, 11
3772    sra        t9, t9, 11
3773    sra        t0, t0, 11
3774    sra        s2, s2, 11
3775    sra        s3, s3, 11
3776    sra        t4, t4, 11
3777    sra        s0, s0, 11
3778    sra        t2, t2, 11
3779    sra        v0, v0, 11
3780    sw         t5, 0(a2)
3781    sw         t3, 32(a2)
3782    sw         t1, 64(a2)
3783    sw         t8, 96(a2)
3784    sw         t9, 128(a2)
3785    sw         t0, 160(a2)
3786    sw         s2, 192(a2)
3787    sw         s3, 224(a2)
3788    sw         t4, 256(a2)
3789    sw         s0, 288(a2)
3790    sw         t2, 320(a2)
3791    sw         v0, 352(a2)
3792    bgtz       a3, 1b
3793     addiu     a2, a2, 4
3794
3795    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3796
3797    j          ra
3798     nop
3799
3800END(jsimd_idct_12x12_pass1_mips_dspr2)
3801
3802/*****************************************************************************/
3803LEAF_MIPS_DSPR2(jsimd_idct_12x12_pass2_mips_dspr2)
3804/*
3805 * a0     - workspace
3806 * a1     - output
3807 */
3808
3809    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3810
3811    li        a3, 12
3812
38131:
3814    // Odd part
3815    lw        t0, 12(a0)
3816    lw        t1, 4(a0)
3817    lw        t2, 20(a0)
3818    lw        t3, 28(a0)
3819    li        t4, 10703     // FIX(1.306562965)
3820    li        t5, 4433      // FIX_0_541196100
3821    mul       t4, t0, t4    // tmp11
3822    mul       t5, t0, t5    // -tmp14
3823    addu      t6, t1, t2    // tmp10
3824    li        t7, 2139      // FIX(0.261052384)
3825    mul       t7, t6, t7    // MULTIPLY(tmp10, FIX(0.261052384))
3826    addu      t6, t6, t3    // tmp10 + z4
3827    li        t8, 7053      // FIX(0.860918669)
3828    mul       t6, t6, t8    // tmp15
3829    li        t8, 2295      // FIX(0.280143716)
3830    mul       t8, t1, t8    // MULTIPLY(z1, FIX(0.280143716))
3831    addu      t9, t2, t3    // z3 + z4
3832    li        s0, 8565      // FIX(1.045510580)
3833    mul       t9, t9, s0    // -tmp13
3834    li        s0, 12112     // FIX(1.478575242)
3835    mul       s0, t2, s0    // MULTIPLY(z3, FIX(1.478575242))
3836    li        s1, 12998     // FIX(1.586706681)
3837    mul       s1, t3, s1    // MULTIPLY(z4, FIX(1.586706681))
3838    li        s2, 5540      // FIX(0.676326758)
3839    mul       s2, t1, s2    // MULTIPLY(z1, FIX(0.676326758))
3840    li        s3, 16244     // FIX(1.982889723)
3841    mul       s3, t3, s3    // MULTIPLY(z4, FIX(1.982889723))
3842    subu      t1, t1, t3    // z1 -= z4
3843    subu      t0, t0, t2    // z2 -= z3
3844    addu      t2, t1, t0    // z1 + z2
3845    li        t3, 4433      // FIX_0_541196100
3846    mul       t2, t2, t3    // z3
3847    li        t3, 6270      // FIX_0_765366865
3848    mul       t1, t1, t3    // MULTIPLY(z1, FIX_0_765366865)
3849    li        t3, 15137     // FIX_1_847759065
3850    mul       t0, t0, t3    // MULTIPLY(z2, FIX_1_847759065)
3851    addu      t3, t6, t7    // tmp12
3852    addu      t7, t3, t4
3853    addu      t7, t7, t8    // tmp10
3854    subu      t3, t3, t9
3855    subu      t3, t3, t5
3856    subu      t3, t3, s0    // tmp12
3857    subu      t9, t6, t9
3858    subu      t9, t9, t4
3859    addu      t9, t9, s1    // tmp13
3860    subu      t6, t6, t5
3861    subu      t6, t6, s2
3862    subu      t6, t6, s3    // tmp15
3863    addu      t1, t2, t1    // tmp11
3864    subu      t0, t2, t0    // tmp14
3865    // even part
3866    lw        t2, 16(a0)    // z4
3867    lw        t4, 8(a0)     // z1
3868    lw        t5, 0(a0)     // z3
3869    lw        t8, 24(a0)    // z2
3870    li        s0, 10033     // FIX(1.224744871)
3871    li        s1, 11190     // FIX(1.366025404)
3872    mul       t2, t2, s0    // z4
3873    mul       s0, t4, s1    // z4
3874    addiu     t5, t5, 0x10
3875    sll       t5, t5, 13    // z3
3876    sll       t4, t4, 13    // z1
3877    sll       t8, t8, 13    // z2
3878    subu      s1, t4, t8    // tmp12
3879    addu      s2, t5, t2    // tmp10
3880    subu      t2, t5, t2    // tmp11
3881    addu      s3, t5, s1    // tmp21
3882    subu      s1, t5, s1    // tmp24
3883    addu      t5, s0, t8    // tmp12
3884    addu      v0, s2, t5    // tmp20
3885    subu      t5, s2, t5    // tmp25
3886    subu      t4, s0, t4
3887    subu      t4, t4, t8    // tmp12
3888    addu      t8, t2, t4    // tmp22
3889    subu      t2, t2, t4    // tmp23
3890    // increment counter and pointers
3891    addiu     a3, a3, -1
3892    addiu     a0, a0, 32
3893    // Final stage
3894    addu      t4, v0, t7
3895    subu      v0, v0, t7
3896    addu      t7, s3, t1
3897    subu      s3, s3, t1
3898    addu      t1, t8, t3
3899    subu      t8, t8, t3
3900    addu      t3, t2, t9
3901    subu      t2, t2, t9
3902    addu      t9, s1, t0
3903    subu      s1, s1, t0
3904    addu      t0, t5, t6
3905    subu      t5, t5, t6
3906    sll       t4, t4, 4
3907    sll       t7, t7, 4
3908    sll       t1, t1, 4
3909    sll       t3, t3, 4
3910    sll       t9, t9, 4
3911    sll       t0, t0, 4
3912    sll       t5, t5, 4
3913    sll       s1, s1, 4
3914    sll       t2, t2, 4
3915    sll       t8, t8, 4
3916    sll       s3, s3, 4
3917    sll       v0, v0, 4
3918    shll_s.w  t4, t4, 2
3919    shll_s.w  t7, t7, 2
3920    shll_s.w  t1, t1, 2
3921    shll_s.w  t3, t3, 2
3922    shll_s.w  t9, t9, 2
3923    shll_s.w  t0, t0, 2
3924    shll_s.w  t5, t5, 2
3925    shll_s.w  s1, s1, 2
3926    shll_s.w  t2, t2, 2
3927    shll_s.w  t8, t8, 2
3928    shll_s.w  s3, s3, 2
3929    shll_s.w  v0, v0, 2
3930    srl       t4, t4, 24
3931    srl       t7, t7, 24
3932    srl       t1, t1, 24
3933    srl       t3, t3, 24
3934    srl       t9, t9, 24
3935    srl       t0, t0, 24
3936    srl       t5, t5, 24
3937    srl       s1, s1, 24
3938    srl       t2, t2, 24
3939    srl       t8, t8, 24
3940    srl       s3, s3, 24
3941    srl       v0, v0, 24
3942    lw        t6, 0(a1)
3943    addiu     t4, t4, 0x80
3944    addiu     t7, t7, 0x80
3945    addiu     t1, t1, 0x80
3946    addiu     t3, t3, 0x80
3947    addiu     t9, t9, 0x80
3948    addiu     t0, t0, 0x80
3949    addiu     t5, t5, 0x80
3950    addiu     s1, s1, 0x80
3951    addiu     t2, t2, 0x80
3952    addiu     t8, t8, 0x80
3953    addiu     s3, s3, 0x80
3954    addiu     v0, v0, 0x80
3955    sb        t4, 0(t6)
3956    sb        t7, 1(t6)
3957    sb        t1, 2(t6)
3958    sb        t3, 3(t6)
3959    sb        t9, 4(t6)
3960    sb        t0, 5(t6)
3961    sb        t5, 6(t6)
3962    sb        s1, 7(t6)
3963    sb        t2, 8(t6)
3964    sb        t8, 9(t6)
3965    sb        s3, 10(t6)
3966    sb        v0, 11(t6)
3967    bgtz      a3, 1b
3968     addiu    a1, a1, 4
3969
3970    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3971
3972    jr        ra
3973     nop
3974
3975END(jsimd_idct_12x12_pass2_mips_dspr2)
3976
3977/*****************************************************************************/
3978LEAF_MIPS_DSPR2(jsimd_convsamp_mips_dspr2)
3979/*
3980 * a0     - sample_data
3981 * a1     - start_col
3982 * a2     - workspace
3983 */
3984
3985    lw             t0, 0(a0)
3986    li             t7, 0xff80ff80
3987    addu           t0, t0, a1
3988    ulw            t1, 0(t0)
3989    ulw            t2, 4(t0)
3990    preceu.ph.qbr  t3, t1
3991    preceu.ph.qbl  t4, t1
3992    lw             t0, 4(a0)
3993    preceu.ph.qbr  t5, t2
3994    preceu.ph.qbl  t6, t2
3995    addu           t0, t0, a1
3996    addu.ph        t3, t3, t7
3997    addu.ph        t4, t4, t7
3998    ulw            t1, 0(t0)
3999    ulw            t2, 4(t0)
4000    addu.ph        t5, t5, t7
4001    addu.ph        t6, t6, t7
4002    usw            t3, 0(a2)
4003    usw            t4, 4(a2)
4004    preceu.ph.qbr  t3, t1
4005    preceu.ph.qbl  t4, t1
4006    usw            t5, 8(a2)
4007    usw            t6, 12(a2)
4008
4009    lw             t0, 8(a0)
4010    preceu.ph.qbr  t5, t2
4011    preceu.ph.qbl  t6, t2
4012    addu           t0, t0, a1
4013    addu.ph        t3, t3, t7
4014    addu.ph        t4, t4, t7
4015    ulw            t1, 0(t0)
4016    ulw            t2, 4(t0)
4017    addu.ph        t5, t5, t7
4018    addu.ph        t6, t6, t7
4019    usw            t3, 16(a2)
4020    usw            t4, 20(a2)
4021    preceu.ph.qbr  t3, t1
4022    preceu.ph.qbl  t4, t1
4023    usw            t5, 24(a2)
4024    usw            t6, 28(a2)
4025
4026    lw             t0, 12(a0)
4027    preceu.ph.qbr  t5, t2
4028    preceu.ph.qbl  t6, t2
4029    addu           t0, t0, a1
4030    addu.ph        t3, t3, t7
4031    addu.ph        t4, t4, t7
4032    ulw            t1, 0(t0)
4033    ulw            t2, 4(t0)
4034    addu.ph        t5, t5, t7
4035    addu.ph        t6, t6, t7
4036    usw            t3, 32(a2)
4037    usw            t4, 36(a2)
4038    preceu.ph.qbr  t3, t1
4039    preceu.ph.qbl  t4, t1
4040    usw            t5, 40(a2)
4041    usw            t6, 44(a2)
4042
4043    lw             t0, 16(a0)
4044    preceu.ph.qbr  t5, t2
4045    preceu.ph.qbl  t6, t2
4046    addu           t0, t0, a1
4047    addu.ph        t3, t3, t7
4048    addu.ph        t4, t4, t7
4049    ulw            t1, 0(t0)
4050    ulw            t2, 4(t0)
4051    addu.ph        t5, t5, t7
4052    addu.ph        t6, t6, t7
4053    usw            t3, 48(a2)
4054    usw            t4, 52(a2)
4055    preceu.ph.qbr  t3, t1
4056    preceu.ph.qbl  t4, t1
4057    usw            t5, 56(a2)
4058    usw            t6, 60(a2)
4059
4060    lw             t0, 20(a0)
4061    preceu.ph.qbr  t5, t2
4062    preceu.ph.qbl  t6, t2
4063    addu           t0, t0, a1
4064    addu.ph        t3, t3, t7
4065    addu.ph        t4, t4, t7
4066    ulw            t1, 0(t0)
4067    ulw            t2, 4(t0)
4068    addu.ph        t5, t5, t7
4069    addu.ph        t6, t6, t7
4070    usw            t3, 64(a2)
4071    usw            t4, 68(a2)
4072    preceu.ph.qbr  t3, t1
4073    preceu.ph.qbl  t4, t1
4074    usw            t5, 72(a2)
4075    usw            t6, 76(a2)
4076
4077    lw             t0, 24(a0)
4078    preceu.ph.qbr  t5, t2
4079    preceu.ph.qbl  t6, t2
4080    addu           t0, t0, a1
4081    addu.ph        t3, t3, t7
4082    addu.ph        t4, t4, t7
4083    ulw            t1, 0(t0)
4084    ulw            t2, 4(t0)
4085    addu.ph        t5, t5, t7
4086    addu.ph        t6, t6, t7
4087    usw            t3, 80(a2)
4088    usw            t4, 84(a2)
4089    preceu.ph.qbr  t3, t1
4090    preceu.ph.qbl  t4, t1
4091    usw            t5, 88(a2)
4092    usw            t6, 92(a2)
4093
4094    lw             t0, 28(a0)
4095    preceu.ph.qbr  t5, t2
4096    preceu.ph.qbl  t6, t2
4097    addu           t0, t0, a1
4098    addu.ph        t3, t3, t7
4099    addu.ph        t4, t4, t7
4100    ulw            t1, 0(t0)
4101    ulw            t2, 4(t0)
4102    addu.ph        t5, t5, t7
4103    addu.ph        t6, t6, t7
4104    usw            t3, 96(a2)
4105    usw            t4, 100(a2)
4106    preceu.ph.qbr  t3, t1
4107    preceu.ph.qbl  t4, t1
4108    usw            t5, 104(a2)
4109    usw            t6, 108(a2)
4110    preceu.ph.qbr  t5, t2
4111    preceu.ph.qbl  t6, t2
4112    addu.ph        t3, t3, t7
4113    addu.ph        t4, t4, t7
4114    addu.ph        t5, t5, t7
4115    addu.ph        t6, t6, t7
4116    usw            t3, 112(a2)
4117    usw            t4, 116(a2)
4118    usw            t5, 120(a2)
4119    usw            t6, 124(a2)
4120
4121    j              ra
4122     nop
4123
4124END(jsimd_convsamp_mips_dspr2)
4125
4126/*****************************************************************************/
4127LEAF_MIPS_DSPR2(jsimd_convsamp_float_mips_dspr2)
4128/*
4129 * a0     - sample_data
4130 * a1     - start_col
4131 * a2     - workspace
4132 */
4133
4134    .set at
4135
4136    lw       t0, 0(a0)
4137    addu     t0, t0, a1
4138    lbu      t1, 0(t0)
4139    lbu      t2, 1(t0)
4140    lbu      t3, 2(t0)
4141    lbu      t4, 3(t0)
4142    lbu      t5, 4(t0)
4143    lbu      t6, 5(t0)
4144    lbu      t7, 6(t0)
4145    lbu      t8, 7(t0)
4146    addiu    t1, t1, -128
4147    addiu    t2, t2, -128
4148    addiu    t3, t3, -128
4149    addiu    t4, t4, -128
4150    addiu    t5, t5, -128
4151    addiu    t6, t6, -128
4152    addiu    t7, t7, -128
4153    addiu    t8, t8, -128
4154    mtc1     t1, f1
4155    mtc1     t2, f2
4156    mtc1     t3, f3
4157    mtc1     t4, f4
4158    mtc1     t5, f5
4159    mtc1     t6, f6
4160    mtc1     t7, f7
4161    mtc1     t8, f8
4162    cvt.s.w  f1, f1
4163    cvt.s.w  f2, f2
4164    cvt.s.w  f3, f3
4165    cvt.s.w  f4, f4
4166    cvt.s.w  f5, f5
4167    cvt.s.w  f6, f6
4168    cvt.s.w  f7, f7
4169    cvt.s.w  f8, f8
4170    lw       t0, 4(a0)
4171    swc1     f1, 0(a2)
4172    swc1     f2, 4(a2)
4173    swc1     f3, 8(a2)
4174    addu     t0, t0, a1
4175    swc1     f4, 12(a2)
4176    swc1     f5, 16(a2)
4177    swc1     f6, 20(a2)
4178    swc1     f7, 24(a2)
4179    swc1     f8, 28(a2)
4180    //elemr 1
4181    lbu      t1, 0(t0)
4182    lbu      t2, 1(t0)
4183    lbu      t3, 2(t0)
4184    lbu      t4, 3(t0)
4185    lbu      t5, 4(t0)
4186    lbu      t6, 5(t0)
4187    lbu      t7, 6(t0)
4188    lbu      t8, 7(t0)
4189    addiu    t1, t1, -128
4190    addiu    t2, t2, -128
4191    addiu    t3, t3, -128
4192    addiu    t4, t4, -128
4193    addiu    t5, t5, -128
4194    addiu    t6, t6, -128
4195    addiu    t7, t7, -128
4196    addiu    t8, t8, -128
4197    mtc1     t1, f1
4198    mtc1     t2, f2
4199    mtc1     t3, f3
4200    mtc1     t4, f4
4201    mtc1     t5, f5
4202    mtc1     t6, f6
4203    mtc1     t7, f7
4204    mtc1     t8, f8
4205    cvt.s.w  f1, f1
4206    cvt.s.w  f2, f2
4207    cvt.s.w  f3, f3
4208    cvt.s.w  f4, f4
4209    cvt.s.w  f5, f5
4210    cvt.s.w  f6, f6
4211    cvt.s.w  f7, f7
4212    cvt.s.w  f8, f8
4213    lw       t0, 8(a0)
4214    swc1     f1, 32(a2)
4215    swc1     f2, 36(a2)
4216    swc1     f3, 40(a2)
4217    addu     t0, t0, a1
4218    swc1     f4, 44(a2)
4219    swc1     f5, 48(a2)
4220    swc1     f6, 52(a2)
4221    swc1     f7, 56(a2)
4222    swc1     f8, 60(a2)
4223    //elemr 2
4224    lbu      t1, 0(t0)
4225    lbu      t2, 1(t0)
4226    lbu      t3, 2(t0)
4227    lbu      t4, 3(t0)
4228    lbu      t5, 4(t0)
4229    lbu      t6, 5(t0)
4230    lbu      t7, 6(t0)
4231    lbu      t8, 7(t0)
4232    addiu    t1, t1, -128
4233    addiu    t2, t2, -128
4234    addiu    t3, t3, -128
4235    addiu    t4, t4, -128
4236    addiu    t5, t5, -128
4237    addiu    t6, t6, -128
4238    addiu    t7, t7, -128
4239    addiu    t8, t8, -128
4240    mtc1     t1, f1
4241    mtc1     t2, f2
4242    mtc1     t3, f3
4243    mtc1     t4, f4
4244    mtc1     t5, f5
4245    mtc1     t6, f6
4246    mtc1     t7, f7
4247    mtc1     t8, f8
4248    cvt.s.w  f1, f1
4249    cvt.s.w  f2, f2
4250    cvt.s.w  f3, f3
4251    cvt.s.w  f4, f4
4252    cvt.s.w  f5, f5
4253    cvt.s.w  f6, f6
4254    cvt.s.w  f7, f7
4255    cvt.s.w  f8, f8
4256    lw       t0, 12(a0)
4257    swc1     f1, 64(a2)
4258    swc1     f2, 68(a2)
4259    swc1     f3, 72(a2)
4260    addu     t0, t0, a1
4261    swc1     f4, 76(a2)
4262    swc1     f5, 80(a2)
4263    swc1     f6, 84(a2)
4264    swc1     f7, 88(a2)
4265    swc1     f8, 92(a2)
4266    //elemr 3
4267    lbu      t1, 0(t0)
4268    lbu      t2, 1(t0)
4269    lbu      t3, 2(t0)
4270    lbu      t4, 3(t0)
4271    lbu      t5, 4(t0)
4272    lbu      t6, 5(t0)
4273    lbu      t7, 6(t0)
4274    lbu      t8, 7(t0)
4275    addiu    t1, t1, -128
4276    addiu    t2, t2, -128
4277    addiu    t3, t3, -128
4278    addiu    t4, t4, -128
4279    addiu    t5, t5, -128
4280    addiu    t6, t6, -128
4281    addiu    t7, t7, -128
4282    addiu    t8, t8, -128
4283    mtc1     t1, f1
4284    mtc1     t2, f2
4285    mtc1     t3, f3
4286    mtc1     t4, f4
4287    mtc1     t5, f5
4288    mtc1     t6, f6
4289    mtc1     t7, f7
4290    mtc1     t8, f8
4291    cvt.s.w  f1, f1
4292    cvt.s.w  f2, f2
4293    cvt.s.w  f3, f3
4294    cvt.s.w  f4, f4
4295    cvt.s.w  f5, f5
4296    cvt.s.w  f6, f6
4297    cvt.s.w  f7, f7
4298    cvt.s.w  f8, f8
4299    lw       t0, 16(a0)
4300    swc1     f1, 96(a2)
4301    swc1     f2, 100(a2)
4302    swc1     f3, 104(a2)
4303    addu     t0, t0, a1
4304    swc1     f4, 108(a2)
4305    swc1     f5, 112(a2)
4306    swc1     f6, 116(a2)
4307    swc1     f7, 120(a2)
4308    swc1     f8, 124(a2)
4309    //elemr 4
4310    lbu      t1, 0(t0)
4311    lbu      t2, 1(t0)
4312    lbu      t3, 2(t0)
4313    lbu      t4, 3(t0)
4314    lbu      t5, 4(t0)
4315    lbu      t6, 5(t0)
4316    lbu      t7, 6(t0)
4317    lbu      t8, 7(t0)
4318    addiu    t1, t1, -128
4319    addiu    t2, t2, -128
4320    addiu    t3, t3, -128
4321    addiu    t4, t4, -128
4322    addiu    t5, t5, -128
4323    addiu    t6, t6, -128
4324    addiu    t7, t7, -128
4325    addiu    t8, t8, -128
4326    mtc1     t1, f1
4327    mtc1     t2, f2
4328    mtc1     t3, f3
4329    mtc1     t4, f4
4330    mtc1     t5, f5
4331    mtc1     t6, f6
4332    mtc1     t7, f7
4333    mtc1     t8, f8
4334    cvt.s.w  f1, f1
4335    cvt.s.w  f2, f2
4336    cvt.s.w  f3, f3
4337    cvt.s.w  f4, f4
4338    cvt.s.w  f5, f5
4339    cvt.s.w  f6, f6
4340    cvt.s.w  f7, f7
4341    cvt.s.w  f8, f8
4342    lw       t0, 20(a0)
4343    swc1     f1, 128(a2)
4344    swc1     f2, 132(a2)
4345    swc1     f3, 136(a2)
4346    addu     t0, t0, a1
4347    swc1     f4, 140(a2)
4348    swc1     f5, 144(a2)
4349    swc1     f6, 148(a2)
4350    swc1     f7, 152(a2)
4351    swc1     f8, 156(a2)
4352    //elemr 5
4353    lbu      t1, 0(t0)
4354    lbu      t2, 1(t0)
4355    lbu      t3, 2(t0)
4356    lbu      t4, 3(t0)
4357    lbu      t5, 4(t0)
4358    lbu      t6, 5(t0)
4359    lbu      t7, 6(t0)
4360    lbu      t8, 7(t0)
4361    addiu    t1, t1, -128
4362    addiu    t2, t2, -128
4363    addiu    t3, t3, -128
4364    addiu    t4, t4, -128
4365    addiu    t5, t5, -128
4366    addiu    t6, t6, -128
4367    addiu    t7, t7, -128
4368    addiu    t8, t8, -128
4369    mtc1     t1, f1
4370    mtc1     t2, f2
4371    mtc1     t3, f3
4372    mtc1     t4, f4
4373    mtc1     t5, f5
4374    mtc1     t6, f6
4375    mtc1     t7, f7
4376    mtc1     t8, f8
4377    cvt.s.w  f1, f1
4378    cvt.s.w  f2, f2
4379    cvt.s.w  f3, f3
4380    cvt.s.w  f4, f4
4381    cvt.s.w  f5, f5
4382    cvt.s.w  f6, f6
4383    cvt.s.w  f7, f7
4384    cvt.s.w  f8, f8
4385    lw       t0, 24(a0)
4386    swc1     f1, 160(a2)
4387    swc1     f2, 164(a2)
4388    swc1     f3, 168(a2)
4389    addu     t0, t0, a1
4390    swc1     f4, 172(a2)
4391    swc1     f5, 176(a2)
4392    swc1     f6, 180(a2)
4393    swc1     f7, 184(a2)
4394    swc1     f8, 188(a2)
4395    //elemr 6
4396    lbu      t1, 0(t0)
4397    lbu      t2, 1(t0)
4398    lbu      t3, 2(t0)
4399    lbu      t4, 3(t0)
4400    lbu      t5, 4(t0)
4401    lbu      t6, 5(t0)
4402    lbu      t7, 6(t0)
4403    lbu      t8, 7(t0)
4404    addiu    t1, t1, -128
4405    addiu    t2, t2, -128
4406    addiu    t3, t3, -128
4407    addiu    t4, t4, -128
4408    addiu    t5, t5, -128
4409    addiu    t6, t6, -128
4410    addiu    t7, t7, -128
4411    addiu    t8, t8, -128
4412    mtc1     t1, f1
4413    mtc1     t2, f2
4414    mtc1     t3, f3
4415    mtc1     t4, f4
4416    mtc1     t5, f5
4417    mtc1     t6, f6
4418    mtc1     t7, f7
4419    mtc1     t8, f8
4420    cvt.s.w  f1, f1
4421    cvt.s.w  f2, f2
4422    cvt.s.w  f3, f3
4423    cvt.s.w  f4, f4
4424    cvt.s.w  f5, f5
4425    cvt.s.w  f6, f6
4426    cvt.s.w  f7, f7
4427    cvt.s.w  f8, f8
4428    lw       t0, 28(a0)
4429    swc1     f1, 192(a2)
4430    swc1     f2, 196(a2)
4431    swc1     f3, 200(a2)
4432    addu     t0, t0, a1
4433    swc1     f4, 204(a2)
4434    swc1     f5, 208(a2)
4435    swc1     f6, 212(a2)
4436    swc1     f7, 216(a2)
4437    swc1     f8, 220(a2)
4438    //elemr 7
4439    lbu      t1, 0(t0)
4440    lbu      t2, 1(t0)
4441    lbu      t3, 2(t0)
4442    lbu      t4, 3(t0)
4443    lbu      t5, 4(t0)
4444    lbu      t6, 5(t0)
4445    lbu      t7, 6(t0)
4446    lbu      t8, 7(t0)
4447    addiu    t1, t1, -128
4448    addiu    t2, t2, -128
4449    addiu    t3, t3, -128
4450    addiu    t4, t4, -128
4451    addiu    t5, t5, -128
4452    addiu    t6, t6, -128
4453    addiu    t7, t7, -128
4454    addiu    t8, t8, -128
4455    mtc1     t1, f1
4456    mtc1     t2, f2
4457    mtc1     t3, f3
4458    mtc1     t4, f4
4459    mtc1     t5, f5
4460    mtc1     t6, f6
4461    mtc1     t7, f7
4462    mtc1     t8, f8
4463    cvt.s.w  f1, f1
4464    cvt.s.w  f2, f2
4465    cvt.s.w  f3, f3
4466    cvt.s.w  f4, f4
4467    cvt.s.w  f5, f5
4468    cvt.s.w  f6, f6
4469    cvt.s.w  f7, f7
4470    cvt.s.w  f8, f8
4471    swc1     f1, 224(a2)
4472    swc1     f2, 228(a2)
4473    swc1     f3, 232(a2)
4474    swc1     f4, 236(a2)
4475    swc1     f5, 240(a2)
4476    swc1     f6, 244(a2)
4477    swc1     f7, 248(a2)
4478    swc1     f8, 252(a2)
4479
4480    j        ra
4481     nop
4482
4483END(jsimd_convsamp_float_mips_dspr2)
4484
4485/*****************************************************************************/
4486
4487